/** Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved. Contact: SYSTAP, LLC DBA Blazegraph 2501 Calvert ST NW #106 Washington, DC 20008 licenses@blazegraph.com This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; version 2 of the License. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* * Created on Mar 13, 2007 */ package com.bigdata.journal; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.lang.ref.WeakReference; import java.net.InetSocketAddress; import java.nio.ByteBuffer; import java.nio.channels.Channel; import java.nio.channels.FileChannel; import java.rmi.RemoteException; import java.security.DigestException; import java.security.NoSuchAlgorithmException; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Properties; import java.util.Set; import java.util.TreeMap; import java.util.UUID; import java.util.concurrent.BrokenBarrierException; import java.util.concurrent.Callable; import java.util.concurrent.CancellationException; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.Future; import java.util.concurrent.FutureTask; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicReference; import java.util.concurrent.locks.Condition; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReentrantLock; import java.util.concurrent.locks.ReentrantReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock.ReadLock; import java.util.concurrent.locks.ReentrantReadWriteLock.WriteLock; import org.apache.log4j.Logger; import com.bigdata.BigdataStatics; import com.bigdata.btree.AbstractBTree; import com.bigdata.btree.BTree; import com.bigdata.btree.Checkpoint; import com.bigdata.btree.ICheckpointProtocol; import com.bigdata.btree.IIndex; import com.bigdata.btree.IRangeQuery; import com.bigdata.btree.ITuple; import com.bigdata.btree.ITupleIterator; import com.bigdata.btree.IndexInconsistentError; import com.bigdata.btree.IndexMetadata; import com.bigdata.btree.keys.ICUVersionRecord; import com.bigdata.btree.view.FusedView; import com.bigdata.cache.ConcurrentWeakValueCache; import com.bigdata.cache.ConcurrentWeakValueCacheWithTimeout; import com.bigdata.cache.HardReferenceQueue; import com.bigdata.concurrent.FutureTaskMon; import com.bigdata.config.Configuration; import com.bigdata.config.IValidator; import com.bigdata.config.IntegerRangeValidator; import com.bigdata.config.IntegerValidator; import com.bigdata.config.LongRangeValidator; import com.bigdata.config.LongValidator; import com.bigdata.counters.AbstractStatisticsCollector; import com.bigdata.counters.CAT; import com.bigdata.counters.CounterSet; import com.bigdata.counters.ICounterSetAccess; import 
com.bigdata.counters.Instrument; import com.bigdata.ha.CommitRequest; import com.bigdata.ha.CommitResponse; import com.bigdata.ha.HAGlue; import com.bigdata.ha.HAStatusEnum; import com.bigdata.ha.HATXSGlue; import com.bigdata.ha.IHAPipelineResetRequest; import com.bigdata.ha.IHAPipelineResetResponse; import com.bigdata.ha.IIndexManagerCallable; import com.bigdata.ha.IJoinedAndNonJoinedServices; import com.bigdata.ha.JoinedAndNonJoinedServices; import com.bigdata.ha.PrepareRequest; import com.bigdata.ha.PrepareResponse; import com.bigdata.ha.QuorumService; import com.bigdata.ha.RunState; import com.bigdata.ha.msg.HANotifyReleaseTimeResponse; import com.bigdata.ha.msg.HAReadResponse; import com.bigdata.ha.msg.HARootBlockRequest; import com.bigdata.ha.msg.HARootBlockResponse; import com.bigdata.ha.msg.HAWriteSetStateResponse; import com.bigdata.ha.msg.IHA2PhaseAbortMessage; import com.bigdata.ha.msg.IHA2PhaseCommitMessage; import com.bigdata.ha.msg.IHA2PhasePrepareMessage; import com.bigdata.ha.msg.IHAAwaitServiceJoinRequest; import com.bigdata.ha.msg.IHADigestRequest; import com.bigdata.ha.msg.IHADigestResponse; import com.bigdata.ha.msg.IHAGatherReleaseTimeRequest; import com.bigdata.ha.msg.IHALogDigestRequest; import com.bigdata.ha.msg.IHALogDigestResponse; import com.bigdata.ha.msg.IHALogRequest; import com.bigdata.ha.msg.IHALogRootBlocksRequest; import com.bigdata.ha.msg.IHALogRootBlocksResponse; import com.bigdata.ha.msg.IHANotifyReleaseTimeRequest; import com.bigdata.ha.msg.IHANotifyReleaseTimeResponse; import com.bigdata.ha.msg.IHAReadRequest; import com.bigdata.ha.msg.IHAReadResponse; import com.bigdata.ha.msg.IHARebuildRequest; import com.bigdata.ha.msg.IHARemoteRebuildRequest; import com.bigdata.ha.msg.IHARootBlockRequest; import com.bigdata.ha.msg.IHARootBlockResponse; import com.bigdata.ha.msg.IHASendState; import com.bigdata.ha.msg.IHASendStoreResponse; import com.bigdata.ha.msg.IHASnapshotDigestRequest; import com.bigdata.ha.msg.IHASnapshotDigestResponse; import com.bigdata.ha.msg.IHASnapshotRequest; import com.bigdata.ha.msg.IHASnapshotResponse; import com.bigdata.ha.msg.IHASyncRequest; import com.bigdata.ha.msg.IHAWriteMessage; import com.bigdata.ha.msg.IHAWriteSetStateRequest; import com.bigdata.ha.msg.IHAWriteSetStateResponse; import com.bigdata.ha.msg.Mock2PhaseCommitProtocolException; import com.bigdata.htree.HTree; import com.bigdata.io.ChecksumUtility; import com.bigdata.io.DirectBufferPool; import com.bigdata.io.IDataRecord; import com.bigdata.io.IDataRecordAccess; import com.bigdata.io.SerializerUtil; import com.bigdata.io.writecache.WriteCacheService; import com.bigdata.journal.Name2Addr.Entry; import com.bigdata.mdi.IResourceMetadata; import com.bigdata.mdi.JournalMetadata; import com.bigdata.quorum.AsynchronousQuorumCloseException; import com.bigdata.quorum.Quorum; import com.bigdata.quorum.QuorumActor; import com.bigdata.quorum.QuorumException; import com.bigdata.quorum.QuorumMember; import com.bigdata.quorum.QuorumTokenTransitions; import com.bigdata.rawstore.IAllocationContext; import com.bigdata.rawstore.IAllocationManagerStore; import com.bigdata.rawstore.IPSOutputStream; import com.bigdata.rawstore.IRawStore; import com.bigdata.rawstore.SimpleMemoryRawStore; import com.bigdata.rawstore.WormAddressManager; import com.bigdata.relation.locator.IResourceLocator; import com.bigdata.resources.ResourceManager; import com.bigdata.rwstore.IAllocationManager; import com.bigdata.rwstore.IHistoryManager; import com.bigdata.rwstore.IRWStrategy; import 
com.bigdata.rwstore.RWStore; import com.bigdata.rwstore.sector.MemStrategy; import com.bigdata.rwstore.sector.MemoryManager; import com.bigdata.service.AbstractHATransactionService; import com.bigdata.service.AbstractTransactionService; import com.bigdata.service.IBigdataFederation; import com.bigdata.util.BytesUtil; import com.bigdata.util.ClocksNotSynchronizedException; import com.bigdata.util.NT; import com.bigdata.util.StackInfoReport; /** * <p> * The journal is a persistence capable data structure supporting atomic commit, * named indices, and full transactions. The {@link BufferMode#DiskRW} mode * provides a persistence scheme based on reusable allocation slots while the * {@link BufferMode#DiskWORM} mode provides an append only persistence scheme. * Journals may be configured in highly available quorums. * </p> * <p> * This class is an abstract implementation of the {@link IJournal} interface * that does not implement the {@link IConcurrencyManager}, * {@link IResourceManager}, or {@link ITransactionService} interfaces. The * {@link Journal} provides a concrete implementation that may be used for a * standalone database complete with concurrency control and transaction * management. * </p> <h2>Limitations</h2> * <p> * The {@link IIndexStore} implementation on this class is NOT thread-safe. The * basic limitation is that the mutable {@link BTree} is NOT thread-safe. The * {@link #getIndex(String)} method exposes this mutable {@link BTree}. If you * use this method to access the mutable {@link BTree} then YOU are responsible * for avoiding concurrent writes on the returned object. * </p> * <p> * See {@link IConcurrencyManager#submit(AbstractTask)} for a thread-safe API * that provides suitable concurrency control for both isolated and unisolated * operations on named indices. Note that the use of the thread-safe API does * NOT protect against applications that directly access the mutable * {@link BTree} using {@link #getIndex(String)}. * </p> * <p> * The {@link IRawStore} interface on this class is thread-safe. However, this * is a low-level API that is not used directly by most applications. The * {@link BTree} class uses this low-level API to read and write its nodes and * leaves on the store. Applications generally use named indices rather than the * {@link IRawStore} interface. * </p> * <p> * Note: transaction processing MAY be concurrent since the write set of * each transaction is written on a distinct {@link TemporaryStore}. However, * without additional concurrency controls, each transaction is NOT thread-safe * and MUST NOT be executed by more than one concurrent thread. Again, see * {@link IConcurrencyManager#submit(AbstractTask)} for a high-concurrency API * for both isolated operations (transactions) and unisolated operations. Note * that the {@link TemporaryStore} backing a transaction will spill * automatically from memory onto disk if the write set of the transaction grows * too large. * </p> * <h2>Commit processing</h2> * <p> * The journal maintains two root blocks. Commit updates the root blocks using * the Challis algorithm. (The root blocks are updated using an alternating * pattern and "timestamps" are recorded at the head and tail of each root block * to detect partial writes. See {@link IRootBlockView} and * {@link RootBlockView}.) When the journal is backed by a disk file, the data * are {@link Options#FORCE_ON_COMMIT optionally flushed to disk on commit}. If * desired, the writes may be flushed before the root blocks are updated to * ensure that the writes are not reordered - see {@link Options#DOUBLE_SYNC}. * </p>
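 * <p>
 * For example, a minimal usage sketch for a standalone {@link Journal}. The
 * property names below are those defined by {@link Options}; the index
 * registration and lookup calls are illustrative and should be checked
 * against the {@link IIndexManager} API:
 * </p>
 * <pre>
 * final Properties properties = new Properties();
 * properties.setProperty(Options.BUFFER_MODE, BufferMode.DiskRW.toString());
 * properties.setProperty(Options.FILE, "/var/data/test.jnl");
 * final Journal journal = new Journal(properties);
 * try {
 *     // Register a named index (the returned mutable BTree is NOT thread-safe).
 *     journal.registerIndex(new IndexMetadata("myIndex", UUID.randomUUID()));
 *     final BTree ndx = journal.getIndex("myIndex");
 *     ndx.insert(new byte[] { 1 }, new byte[] { 2 });
 *     // Atomic commit: flushes buffered writes and updates the root blocks.
 *     journal.commit();
 * } finally {
 *     journal.close();
 * }
 * </pre>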
 * * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a> * * @todo There are lots of annoying ways in which asynchronously closing the * journal, e.g., using {@link #close()} or {@link #shutdown()}, can cause * exceptions to be thrown out of concurrent threads. It would be nice if * we could throw a single exception that indicated that the journal had * been asynchronously closed. */ public abstract class AbstractJournal implements IJournal/* , ITimestampService */ , IAllocationManager, IAllocationManagerStore { /** * Logger. */ private static final Logger log = Logger.getLogger(AbstractJournal.class); /** * @see http://sourceforge.net/apps/trac/bigdata/ticket/443 (Logger for * RWStore transaction service and recycler) */ private static final Logger txLog = Logger.getLogger("com.bigdata.txLog"); /** * Logger for HA events. */ protected static final Logger haLog = Logger.getLogger("com.bigdata.haLog"); /** * The index of the root address containing the address of the persistent * {@link Name2Addr} mapping names to {@link BTree}s registered for the * store. */ public static transient final int ROOT_NAME2ADDR = 0; /** * The index of the root address where the root block copy from the previous * commit is stored. */ public static transient final int PREV_ROOTBLOCK = 1; /** * The index of the root address of the delete blocks associated with * this transaction. */ public static transient final int DELETEBLOCK = 2; /** * The index of the root address containing the {@link ICUVersionRecord}. * That record specifies the ICU version metadata which was in force when * the journal was created. */ public static transient final int ROOT_ICUVERSION = 3; /** * A clone of the properties used to initialize the {@link Journal}. */ final protected Properties properties; /** * The #of open journals (JVM wide). This is package private. It is used to * chase down unit tests which are not {@link #close() closing} the Journal. */ final static AtomicInteger nopen = new AtomicInteger(); /** * The #of closed journals (JVM wide). This is package private. It is used * to chase down unit tests which are not {@link #close() closing} the * Journal. */ final static AtomicInteger nclose = new AtomicInteger(); /** * The #of destroyed journals (JVM wide). This is package private. It is * used to chase down unit tests which are not {@link #destroy() destroying} * the journal. */ final static AtomicInteger ndestroy = new AtomicInteger(); /** * The directory that should be used for temporary files. */ final public File tmpDir; /** * The object used by the journal to compute the checksums of its root * blocks (this object is NOT thread-safe so there is one instance per * journal). */ private final ChecksumUtility checker = new ChecksumUtility(); // /* // * These fields were historically marked as [final] and set by the // * constructor. With the introduction of high availability these fields can // * not be final because the CREATE of the journal must be deferred until a // * quorum leader has been elected. // * // * The pattern for these fields is that they are assigned by create() and // * are thereafter immutable. The fields are marked as [volatile] so the // * state change when they are set will be visible without explicit // * synchronization (many methods use volatile reads on these fields).
// */ /** * The metadata for a pre-existing journal -or- <code>null</code> if the * journal was created for the first time. */ private FileMetadata fileMetadata; /** package private method exported to {@link DumpJournal}. */ FileMetadata getFileMetadata() { return fileMetadata; } /** * The implementation logic for the current {@link BufferMode}. */ private final IBufferStrategy _bufferStrategy; /** * A description of the journal as a resource. * <p> * Note: For HA, this is updated if new root blocks are installed onto the * journal. This is necessary since that operation changes the {@link UUID} * of the backing store, which is one of the things reported by the * {@link JournalMetadata} class. * * @see #installRootBlocks(IRootBlockView, IRootBlockView) */ private final AtomicReference<JournalMetadata> journalMetadata = new AtomicReference<JournalMetadata>(); /* * */ /** * The current root block. This is updated each time a new root block is * written. */ private volatile IRootBlockView _rootBlock; /** * The registered committers for each slot in the root block. */ private volatile ICommitter[] _committers = new ICommitter[ICommitRecord.MAX_ROOT_ADDRS]; /* * */ /** * This lock is used to prevent clearing or setting of critical fields while * a concurrent thread is seeking to operate on the referenced objects. A * read-write lock was chosen because nearly all operations are "reads" (they * read the field reference). For this purpose the "writer" is a thread * which causes the value of either of these fields to be changed. The * "writers" are {@link #abort()}, {@link #closeForWrites(long)}, etc. These * methods are invoked relatively infrequently in comparison with "read" * access to these fields. */ private final ReentrantReadWriteLock _fieldReadWriteLock = new ReentrantReadWriteLock(false/* fair */); /** * This lock is needed to synchronize serviceJoin of a <em>follower</em> * with a GATHER. Specifically it ensures that if a service is trying to * join during a replicated write set, then the GATHER and the SERVICE JOIN * are MUTEX. */ private final Lock _gatherLock = new ReentrantLock(); /** * Used to cache the most recent {@link ICommitRecord} -- discarded on * {@link #abort()}; set by {@link #commitNow(long)}. * <p> * Note: This is set in the constructor and modified by {@link #_abort()} * but (once set by the constructor) it is never <code>null</code> until the * store is closed. */ private volatile ICommitRecord _commitRecord; /** * BTree mapping commit timestamps to the address of the corresponding * {@link ICommitRecord}. The keys are timestamps (long integers). The * values are the address of the {@link ICommitRecord} with that commit * timestamp. * <p> * Note: The {@link CommitRecordIndex} object is NOT systematically * protected by <code>synchronized</code> within this class. Therefore it is * NOT safe for use by outside classes and CAN NOT be made safe simply by * synchronizing their access on the {@link CommitRecordIndex} object * itself. This is mainly for historical reasons and it may be possible to * systematically protect access to this index within a synchronized block * and then expose it to other classes. * <p> * Note: This is set in the constructor and modified by {@link #_abort()} * but (once set by the constructor) it is never <code>null</code> until the * store is closed. */ private volatile CommitRecordIndex _commitRecordIndex; /** * The {@link ICUVersionRecord} iff known.
*/ private volatile ICUVersionRecord _icuVersionRecord; /** * The configured capacity for the {@link HardReferenceQueue} backing the * index cache maintained by the "live" {@link Name2Addr} object. * * @see Options#LIVE_INDEX_CACHE_CAPACITY */ private final int liveIndexCacheCapacity; /** * The configured timeout in milliseconds for stale entries in the * {@link HardReferenceQueue} backing the index cache maintained by the * "live" {@link Name2Addr} object. * * @see Options#LIVE_INDEX_CACHE_TIMEOUT */ private final long liveIndexCacheTimeout; /** * The configured capacity for the LRU backing the * {@link #historicalIndexCache}. * * @see Options#HISTORICAL_INDEX_CACHE_CAPACITY */ private final int historicalIndexCacheCapacity; /** * The configured timeout for stale entries in the * {@link HardReferenceQueue} backing the {@link #historicalIndexCache}. * * @see Options#HISTORICAL_INDEX_CACHE_TIMEOUT */ private final long historicalIndexCacheTimeout; /** * A cache that is used by the {@link AbstractJournal} to provide a * <em>canonicalizing</em> mapping from an address to the instance of a * read-only historical object loaded from that address and which indirectly * controls how long the journal will "keep open" historical index objects * by preventing them from being swept by the garbage collector. * <p> * Note: the "live" version of an object MUST NOT be placed into this cache * since its state will continue to evolve with additional writes while the * cache is intended to provide a canonicalizing mapping to the historical * committed states of the object. This means that objects such as indices * and the {@link Name2Addr} index MUST NOT be inserted into the cache if * they are being read from the store for "live" use. For this reason * {@link Name2Addr} uses its own caching mechanisms. * <p> * Note: {@link #abort()} discards the contents of this cache in order to * ensure that partial writes are discarded. * * @see Options#HISTORICAL_INDEX_CACHE_CAPACITY * @see Options#HISTORICAL_INDEX_CACHE_TIMEOUT * * TODO This should have {@link ICheckpointProtocol} values. We have to * update the * {@link IRWStrategy#registerExternalCache(ConcurrentWeakValueCache, int)} * method in order to make that change. */ // final private WeakValueCache<Long, ICommitter> historicalIndexCache; final private ConcurrentWeakValueCache<Long, ICommitter> historicalIndexCache; /** * A cache that is used to avoid lookups against the * {@link CommitRecordIndex} and {@link Name2Addr} for historical index * views. * <p> * Note: This cache is in front of the {@link #historicalIndexCache} as the * latter is only tested once we have the {@link ICommitRecord} and have * resolved the entry in {@link Name2Addr}. This cache allows us to avoid * both of those steps. * <p> * Note: The {@link #historicalIndexCache} imposes a canonicalizing mapping. * It remains necessary, even with the introduction of the * {@link #indexCache}. * * @see #getIndex(String, long) * @see <a href="http://sourceforge.net/apps/trac/bigdata/ticket/546" > Add * cache for access to historical index views on the Journal by name * and commitTime. </a> */ final private ConcurrentWeakValueCacheWithTimeout<NT, ICheckpointProtocol> indexCache; /** * The "live" BTree mapping index names to the last metadata record * committed for the named index. The keys are index names (unicode * strings). The values are the names and the last known address of the * named btree.
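 * <p>
 * As emphasized below, all access to this mutable {@link BTree} MUST be
 * synchronized on the {@link Name2Addr} object itself. A minimal sketch of
 * that locking idiom (illustrative only):
 * <pre>
 * final Name2Addr n2a = _getName2Addr();
 * synchronized (n2a) {
 *     // read or update the state of the live Name2Addr here.
 * }
 * </pre>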
* <p> * The "live" name2addr index is required for unisolated writers regardless * whether they are adding an index, dropping an index, or just recovering * the "live" version of an existing named index. * <p> * Operations that read on historical {@link Name2Addr} objects can of * course be concurrent. Those objects are loaded from an * {@link ICommitRecord}. See {@link #getIndex(String, ICommitRecord)}. * <p> * Note: access to the "live" {@link Name2Addr} index MUST be bracketed with * <code>synchronized({@link #_name2Addr})</code>. * * @see #getName2Addr() */ private volatile Name2Addr _name2Addr; /** * An atomic state specifying whether a clean abort is required. This is set * to true by critical section code in the _abort if it does not complete cleanly. * <p> * It is checked in the commit() method ensure updates are protected. * * @see #1021 (Add critical section protection to AbstractJournal.abort() and BigdataSailConnection.rollback()) */ private final AtomicBoolean abortRequired = new AtomicBoolean(false); /** * Return the "live" BTree mapping index names to the last metadata record * committed for the named index. The keys are index names (unicode * strings). The values are the names and the last known address of the * named btree. * <p> * The "live" name2addr index is required for unisolated writers regardless * whether they are adding an index, dropping an index, or just recovering * the "live" version of an existing named index. * <p> * Operations that read on historical {@link Name2Addr} objects can of * course be concurrent. Those objects are loaded from an * {@link ICommitRecord}. See {@link #getIndex(String, ICommitRecord)}. * <p> * Note: the "live" {@link Name2Addr} index is a mutable {@link BTree}. All * access to the object MUST be synchronized on that object. */ protected Name2Addr _getName2Addr() { final ReadLock lock = _fieldReadWriteLock.readLock(); lock.lock(); try { final Name2Addr tmp = _name2Addr; if (tmp == null) throw new AssertionError(); return tmp; } finally { lock.unlock(); } } /** * A read-only view of the {@link Name2Addr} object mapping index names to * the most recent committed {@link Entry} for the named index. The keys are * index names (unicode strings). The values are {@link Entry}s containing * the names, commitTime, and last known {@link Checkpoint} address of the * named {@link BTree} on the {@link Journal}. */ public IIndex getName2Addr() { final ReadLock lock = _fieldReadWriteLock.readLock(); lock.lock(); try { final long checkpointAddr; if (_name2Addr == null) { checkpointAddr = getRootAddr(ROOT_NAME2ADDR); } else { checkpointAddr = _name2Addr.getCheckpoint().getCheckpointAddr(); } /* * Note: This uses the canonicalizing mapping to get an instance * that is distinct from the live #name2Addr object while not * allowing more than a single such distinct instance to exist for * the current name2Addr object. */ final BTree btree = (BTree) getIndexWithCheckpointAddr(checkpointAddr); // References must be distinct. if (_name2Addr == btree) throw new AssertionError(); // /* // * Wrap up in a read-only view since writes MUST NOT be allowed. // */ // return new ReadOnlyIndex(btree); /* * Set the last commit time on the Name2Addr view. * * TODO We can not reliably set the lastCommitTime on Name2Addr in * this method. It will typically be the commitTime associated last * commit point on the store, but it *is* possible to have a commit * on the database where no named indices were dirty and hence no * update was made to Name2Addr. 
In this (rare, but real) case, the * factual Name2Addr lastCommitTime would be some commit time * earlier than the lastCommitTime reported by the Journal. */ final long lastCommitTime = getLastCommitTime(); if (lastCommitTime != 0L) { btree.setLastCommitTime(lastCommitTime); } return btree; } finally { lock.unlock(); } } /** * Return a read-only view of the {@link Name2Addr} object as of the * specified commit time. * * @param commitTime * A commit time. * * @return The read-only view -or- <code>null</code> if there is no commit * record for that commitTime. * * @see #getName2Addr() */ public IIndex getName2Addr(final long commitTime) { final ICommitRecord commitRecord = getCommitRecord(commitTime); if (commitRecord == null) { return null; } final long checkpointAddr = commitRecord.getRootAddr(ROOT_NAME2ADDR); final Name2Addr n2a = (Name2Addr) getIndexWithCheckpointAddr(checkpointAddr); // return new ReadOnlyIndex(n2a); /* * Set the last commit time on the Name2Addr index view. * * TODO We can not reliably set the lastCommitTime on Name2Addr in this * method. It will typically be the commitTime associated with the * [commitRecord] that we resolved above, but it *is* possible to have a * commit on the database where no named indices were dirty and hence no * update was made to Name2Addr. In this (rare, but real) case, the * factual Name2Addr lastCommitTime would be some commit time earlier * than the commitTime reported by the [commitRecord]. */ final long commitTime2 = commitRecord.getTimestamp(); n2a.setLastCommitTime(commitTime2); return n2a; } // /** // * Return the root block view associated with the commitRecord for the // * provided commit time. This requires accessing the next commit record // * since the previous root block is stored with each record. // * // * @param commitTime // * A commit time. // * // * @return The root block view -or- <code>null</code> if there is no commit // * record for that commitTime. // */ // @Deprecated // This method is unused and lacks a unit test. // IRootBlockView getRootBlock(final long commitTime) { // // /* // * Note: getCommitRecordStrictlyGreaterThan() uses appropriate // * synchronization for the CommitRecordIndex. // */ // final ICommitRecord commitRecord = getCommitRecordStrictlyGreaterThan(commitTime); // // if (commitRecord == null) { // // return null; // // } // // final long rootBlockAddr = commitRecord.getRootAddr(PREV_ROOTBLOCK); // // if (rootBlockAddr == 0) { // // return null; // // } else { // // final ByteBuffer bb = read(rootBlockAddr); // // return new RootBlockView(true /* rb0 - WTH */, bb, checker); // // } // // } // // /** // * // * @param startTime from which to begin iteration // * // * @return an iterator over the committed root blocks // */ // @Deprecated // /* // * This is UNUSED AND NOT SAFE (used only in test suite by // * StressTestConcurrentTx, which I have commented out) and not safe (because // * it lacks the necessary locks to access the CommitRecordIndex). The code // * is also wrong since it visits GT the commitTime when it should visit GTE // * the commitTime. 
// */ // Iterator<IRootBlockView> getRootBlocks(final long startTime) { // return new Iterator<IRootBlockView>() { // ICommitRecord commitRecord = getCommitRecordIndex().findNext(startTime); // // public boolean hasNext() { // return commitRecord != null; // } // // public IRootBlockView next() { // final long rootBlockAddr = commitRecord.getRootAddr(PREV_ROOTBLOCK); // // commitRecord = getCommitRecordIndex().findNext(commitRecord.getTimestamp()); // // if (rootBlockAddr == 0) { // return null; // } else { // ByteBuffer bb = read(rootBlockAddr); // // return new RootBlockView(true /* rb0 - WTH */, bb, checker); // } // } // // public void remove() { // throw new UnsupportedOperationException(); // } // // }; // } /** * True iff the journal was opened in a read-only mode. */ private final boolean readOnly; /** * Option that controls whether the journal forces application data to disk * before updating the root blocks. */ protected final boolean doubleSync; /** * Option that controls how the journal behaves during a commit. */ protected final ForceEnum forceOnCommit; /** * Option set by the test suites that causes the file backing the journal to be * deleted when the journal is closed. */ protected final boolean deleteOnClose; /** * The maximum extent before a {@link #commit()} will {@link #overflow()}. * In practice, overflow tries to trigger before this point in order to * avoid extending the journal. * * @see Options#MAXIMUM_EXTENT */ private final long maximumExtent; private final long initialExtent; private final long minimumExtension; private RootBlockCommitter m_rootBlockCommitter; /** * The maximum extent before a {@link #commit()} will {@link #overflow()}. * In practice, overflow tries to trigger before this point in order to * avoid extending the journal. * * @see Options#MAXIMUM_EXTENT */ final public long getMaximumExtent() { return maximumExtent; } /** * Resolves the property value (static variant for ctor initialization). * * @see Configuration#getProperty(IIndexManager, Properties, String, String, * String) */ static protected String getProperty(final Properties properties, final String name, final String defaultValue) { return Configuration.getProperty(null/* indexManager */, properties, ""/* * no * namespace */, name, defaultValue); } /** * Resolves the property value. * * @see Configuration#getProperty(IIndexManager, Properties, String, String, * String) */ protected String getProperty(final String name, final String defaultValue) { return Configuration.getProperty(this, properties, ""/* no namespace */, name, defaultValue); } /** * Resolves, parses, and validates the property value. * * @param name * The property name. * @param defaultValue * The default value. * @return The resolved, parsed, and validated value. */ protected <E> E getProperty(final String name, final String defaultValue, IValidator<E> validator) { return Configuration.getProperty(this, properties, ""/* no namespace */, name, defaultValue, validator); } /** * Create or re-open a journal. * * @param properties * The properties as defined by {@link Options}. * * @throws RuntimeException * If there is a problem when creating, opening, or reading from * the journal file. * * @see Options */ protected AbstractJournal(final Properties properties) { this(properties, null/* quorum */); } /** * Create or re-open a journal as part of a highly available {@link Quorum}. * * @param properties * The properties as defined by {@link Options}. * @param quorum * The quorum with which the journal will join (HA mode only).
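 *            (For a standalone journal, <code>null</code> is passed here;
 *            see the single-argument constructor above.)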
* * @throws RuntimeException * If there is a problem when creating, opening, or reading from * the journal file. * * @see Options */ protected AbstractJournal(Properties properties, final Quorum<HAGlue, QuorumService<HAGlue>> quorum) { if (properties == null) throw new IllegalArgumentException(); this.properties = properties = (Properties) properties.clone(); // null unless in HA mode. this.quorum = quorum; /* * Set various 'final' properties. */ { historicalIndexCacheCapacity = getProperty(Options.HISTORICAL_INDEX_CACHE_CAPACITY, Options.DEFAULT_HISTORICAL_INDEX_CACHE_CAPACITY, IntegerValidator.GT_ZERO); historicalIndexCacheTimeout = getProperty(Options.HISTORICAL_INDEX_CACHE_TIMEOUT, Options.DEFAULT_HISTORICAL_INDEX_CACHE_TIMEOUT, LongValidator.GTE_ZERO); // historicalIndexCache = new WeakValueCache<Long, ICommitter>( // new LRUCache<Long, ICommitter>(historicalIndexCacheCapacity)); // Cache by addr historicalIndexCache = new ConcurrentWeakValueCacheWithTimeout<Long, ICommitter>(historicalIndexCacheCapacity, TimeUnit.MILLISECONDS.toNanos(historicalIndexCacheTimeout)); // Cache by (name,commitTime). This cache is in front of the cache by addr. indexCache = new ConcurrentWeakValueCacheWithTimeout<NT, ICheckpointProtocol>(historicalIndexCacheCapacity, TimeUnit.MILLISECONDS.toNanos(historicalIndexCacheTimeout)); liveIndexCacheCapacity = getProperty(Options.LIVE_INDEX_CACHE_CAPACITY, Options.DEFAULT_LIVE_INDEX_CACHE_CAPACITY, IntegerValidator.GT_ZERO); liveIndexCacheTimeout = getProperty(Options.LIVE_INDEX_CACHE_TIMEOUT, Options.DEFAULT_LIVE_INDEX_CACHE_TIMEOUT, LongValidator.GTE_ZERO); } initialExtent = getProperty(Options.INITIAL_EXTENT, Options.DEFAULT_INITIAL_EXTENT, new LongRangeValidator( Options.minimumInitialExtent, Long.MAX_VALUE)); maximumExtent = getProperty(Options.MAXIMUM_EXTENT, Options.DEFAULT_MAXIMUM_EXTENT, new LongRangeValidator( initialExtent, Long.MAX_VALUE)); minimumExtension = getProperty(Options.MINIMUM_EXTENSION, Options.DEFAULT_MINIMUM_EXTENSION, new LongRangeValidator(Options.minimumMinimumExtension, Long.MAX_VALUE)); readOnly = (Boolean.parseBoolean(getProperty(Options.READ_ONLY, Options.DEFAULT_READ_ONLY))); forceOnCommit = ForceEnum.parse(getProperty(Options.FORCE_ON_COMMIT, Options.DEFAULT_FORCE_ON_COMMIT)); doubleSync = Boolean.parseBoolean(getProperty(Options.DOUBLE_SYNC, Options.DEFAULT_DOUBLE_SYNC)); deleteOnClose = Boolean.parseBoolean(getProperty(Options.DELETE_ON_CLOSE, Options.DEFAULT_DELETE_ON_CLOSE)); // "tmp.dir" { tmpDir = new File(getProperty(Options.TMP_DIR, System.getProperty("java.io.tmpdir"))); if (!tmpDir.exists()) { if (!tmpDir.mkdirs()) { throw new RuntimeException("Could not create directory: " + tmpDir.getAbsolutePath()); } } if (!tmpDir.isDirectory()) { throw new RuntimeException("Not a directory: " + tmpDir.getAbsolutePath()); } } /* * Create the appropriate IBufferStrategy object. * * Note: the WriteLock is obtained here because various methods such as * _getCommitRecord() assert that the caller is holding the write lock * in order to provide runtime safety checks. */ final WriteLock lock = _fieldReadWriteLock.writeLock(); lock.lock(); try { /* * Peek at the buffer mode as configured in the properties. If it is * a memory-only buffer mode, then we will take one code path. If it * is a disk-backed buffer mode, then FileMetadata will figure out * what the actual mode should be. If it is a new file, then it is * just the mode configured in the Properties. If the file exists, * then it is the mode as recorded in the file. 
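 *
 * For example (hypothetical): a store created with Options.BUFFER_MODE set
 * to DiskWORM and later re-opened with DiskRW in the Properties will still
 * open as DiskWORM, since for an existing file the mode recorded in the
 * store header / root blocks wins over the configured value.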
*/ if (BufferMode.valueOf( getProperty(Options.BUFFER_MODE, Options.DEFAULT_BUFFER_MODE)).isFullyBuffered()) { /* * Memory only buffer modes. */ if (readOnly) { throw new RuntimeException( "readOnly not supported for transient journals."); } fileMetadata = null; final long createTime = Long.parseLong(getProperty( Options.CREATE_TIME, "" + System.currentTimeMillis())); // Note: Only used by WORM mode stores. final int offsetBits = getProperty( Options.OFFSET_BITS, Integer.toString((this instanceof Journal ? WormAddressManager.SCALE_UP_OFFSET_BITS : WormAddressManager.SCALE_OUT_OFFSET_BITS)), new IntegerRangeValidator( WormAddressManager.MIN_OFFSET_BITS, WormAddressManager.MAX_OFFSET_BITS)); final BufferMode bufferMode = BufferMode.valueOf( getProperty(Options.BUFFER_MODE, Options.DEFAULT_BUFFER_MODE) ); switch (bufferMode) { case Transient: { final boolean useDirectBuffers = Boolean .parseBoolean(getProperty( Options.USE_DIRECT_BUFFERS, Options.DEFAULT_USE_DIRECT_BUFFERS)); _bufferStrategy = new TransientBufferStrategy(offsetBits, initialExtent, 0L/* * soft limit for maximumExtent */, useDirectBuffers); break; } case MemStore: { _bufferStrategy = new MemStrategy(new MemoryManager( DirectBufferPool.INSTANCE, Integer.MAX_VALUE/* maxSectors */, true/*blocking*/, properties)); break; } default: throw new AssertionError("bufferMode=" + bufferMode); } /* * setup the root blocks. */ final int nextOffset = 0; final long firstCommitTime = 0L; final long lastCommitTime = 0L; final long commitCounter = 0L; final long commitRecordAddr = 0L; final long commitRecordIndexAddr = 0L; final UUID uuid = UUID.randomUUID(); // Journal's UUID. final long closedTime = 0L; final long blockSequence = IRootBlockView.NO_BLOCK_SEQUENCE; final StoreTypeEnum storeType = bufferMode.getStoreType(); if (createTime == 0L) { throw new IllegalArgumentException( "Create time may not be zero."); } final IRootBlockView rootBlock0 = new RootBlockView(true, offsetBits, nextOffset, firstCommitTime, lastCommitTime, commitCounter, commitRecordAddr, commitRecordIndexAddr, uuid, // blockSequence,// quorumToken,// 0L, // metaStartAddr 0L, // metaStartBits storeType,// createTime, closedTime, RootBlockView.currentVersion, checker); final IRootBlockView rootBlock1 = new RootBlockView(false, offsetBits, nextOffset, firstCommitTime, lastCommitTime, commitCounter, commitRecordAddr, commitRecordIndexAddr, uuid, // blockSequence,// quorumToken,// 0L, // metaStartAddr 0L, // metaStartBits storeType,// createTime, closedTime, RootBlockView.currentVersion, checker); _bufferStrategy.writeRootBlock(rootBlock0, ForceEnum.No); _bufferStrategy.writeRootBlock(rootBlock1, ForceEnum.No); this._rootBlock = rootBlock1; /* * End memory backed modes. */ } else { /* * Disk backed modes. */ /*final FileMetadata*/ fileMetadata = FileMetadata.createInstance( properties, !(this instanceof Journal), quorumToken); /* * Note: Use the BufferMode as reported by FileMetadata. This * will be the right mode on a restart as it checks what is * actually in the store header / root blocks. */ final BufferMode bufferMode = fileMetadata.bufferMode; // /* // * Note: The caller SHOULD specify an explicit [createTime] when // * its value is critical. The default assigned here does NOT // * attempt to use a clock that is consistent with the commit // * protocol or even a clock that assigns unique timestamps. 
// */ // final long createTime = Long // .parseLong(getProperty(Options.CREATE_TIME, "" + System.currentTimeMillis())); // // assert createTime != 0L; switch (bufferMode) { case Direct: { /* * Setup the buffer strategy. */ _bufferStrategy = new DirectBufferStrategy(0L/* * soft limit * for * maximumExtent */, fileMetadata); this._rootBlock = fileMetadata.rootBlock; break; } case Mapped: { /* * Setup the buffer strategy. */ /* * Note: the maximumExtent is a hard limit in this case only * since resize is not supported for mapped files. */ _bufferStrategy = new MappedBufferStrategy(maximumExtent /* * hard * limit * for * maximum * extent */, fileMetadata); this._rootBlock = fileMetadata.rootBlock; break; } // case Disk: { // // /* // * Setup the buffer strategy. // */ // // _bufferStrategy = new DiskOnlyStrategy(0L/* // * soft limit for // * maximumExtent // */, // // minimumExtension, // fileMetadata); // // this._rootBlock = fileMetadata.rootBlock; // // break; // // } case Disk: case DiskWORM: { /* * Setup the buffer strategy. */ _bufferStrategy = new WORMStrategy( 0L,// soft limit for maximumExtent minimumExtension,// fileMetadata, // quorum// ); this._rootBlock = fileMetadata.rootBlock; break; } case DiskRW: { /* * Setup the buffer strategy. */ _bufferStrategy = new RWStrategy(fileMetadata, quorum); this._rootBlock = fileMetadata.rootBlock; break; } case TemporaryRW: { /* * Setup the buffer strategy. */ _bufferStrategy = new RWStrategy(fileMetadata, quorum); this._rootBlock = fileMetadata.rootBlock; break; } case Temporary: { /* * Setup the buffer strategy. * * FIXME Add test suite for this buffer mode. It should * support MRMW but is not restart-safe. */ // FIXME Change BufferMode.Temporary to use WORMStrategy _bufferStrategy = new DiskOnlyStrategy(0L/* * soft limit for * maximumExtent */, // minimumExtension, fileMetadata); this._rootBlock = fileMetadata.rootBlock; break; } default: throw new AssertionError(); } } /* * Note: Creating a new journal registers some internal indices but * does NOT perform a commit. Those indices will become restart safe * with the first commit. */ // Save resource description (sets value returned by getUUID()). this.journalMetadata.set(new JournalMetadata(this)); // new or reload from the store root block. this._commitRecord = _getCommitRecord(); // new or re-load commit record index from store via root block. this._commitRecordIndex = _getCommitRecordIndex(); /** * If the store can recycle storage then we must provide a hook to * allow the removal of cached data when it is available for recycling. */ if (_bufferStrategy instanceof IHistoryManager) { final int checkpointRecordSize = getByteCount(_commitRecordIndex .getCheckpoint().getCheckpointAddr()); ((IHistoryManager) _bufferStrategy).registerExternalCache( historicalIndexCache, checkpointRecordSize); } // new or re-load from the store. this._icuVersionRecord = _getICUVersionRecord(); // verify the ICU version. if (this._icuVersionRecord != null && !ICUVersionRecord.newInstance().equals( this._icuVersionRecord)) { final boolean update = Boolean.valueOf(properties.getProperty( Options.UPDATE_ICU_VERSION, "false")); if (!update) { throw new RuntimeException("ICUVersionChange: store=" + this._icuVersionRecord + ", runtime=" + ICUVersionRecord.newInstance()); } } // Give the store a chance to set any committers that it defines. setupCommitters(); // report event. ResourceManager.openJournal(getFile() == null ? 
null : getFile().toString(), size(), getBufferStrategy() .getBufferMode()); if (txLog.isInfoEnabled()) txLog.info("OPEN-JOURNAL: uuid=" + getUUID() + ", file=" + getFile() + ", bufferMode=" + getBufferStrategy().getBufferMode()); } finally { lock.unlock(); } nopen.incrementAndGet(); } /** * @todo consider making the properties restart safe so that they can be * read from the journal. This will let some properties be specified * on initialization while letting others default or be overridden on * restart. This is trivially accomplished by dedicating a root slot * to a Properties object, or a flattened Properties object serialized * as key-value pairs, in which case the data could just be loaded * into a btree and the btree api could be used to change the * persistent properties as necessary. */ @Override final public Properties getProperties() { return new Properties(properties); } /** * Return the delegate that implements the {@link BufferMode}. * <p> * Note: this method MUST NOT check to see whether the journal is open since * we need to use it if we want to invoke * {@link IBufferStrategy#deleteResources()} and we can only invoke that * method once the journal is closed. */ public IBufferStrategy getBufferStrategy() { return _bufferStrategy; } /** * Service for running arbitrary tasks in support of * {@link IResourceLocator}. There is no concurrency control associated with * this service, but tasks run here may submit tasks to the * {@link ConcurrencyManager}. */ @Override abstract public ExecutorService getExecutorService(); /** * Shutdown the journal (running tasks will run to completion, but no new * tasks will start). * <p> * Note: You SHOULD use this method rather than {@link #close()} for normal * shutdown of the journal. * * @see #shutdownNow() */ @Override synchronized public void shutdown() { // Note: per contract for shutdown. if (!isOpen()) return; if (log.isInfoEnabled()) log.info(""); // close immediately. _close(); if (log.isInfoEnabled()) log.info("Shutdown complete."); } /** * Immediate shutdown (running tasks are canceled rather than being * permitted to complete). * * @see #shutdown() */ @Override synchronized public void shutdownNow() { // Note: per contract for shutdownNow() if (!isOpen()) return; if (log.isInfoEnabled()) log.info(""); // close immediately. _close(); if (log.isInfoEnabled()) log.info("Shutdown complete."); } /** * Closes out the journal iff it is still open. */ @Override protected void finalize() throws Throwable { if (_bufferStrategy.isOpen()) { if (log.isInfoEnabled()) log.info("Closing journal: " + getFile()); shutdownNow(); } } /** * Return counters reporting on various aspects of the journal. */ @Override public CounterSet getCounters() { return CountersFactory.getCounters(this); } /** * Note: A combination of a static inner class and a weak reference to the * outer class is used to avoid the returned {@link CounterSet} having a * hard reference to the outer class while retaining the ability to update * the {@link CounterSet} dynamically as long as the referenced object * exists. * <p> * Note: one-shot counters are NOT used so that the LBS can aggregate the * different values which this counter takes on across different live * journal instances for the same data service. For example, the createTime * for each live journal or the name of the file backing the current live * journal.
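 * <p>
 * In miniature, the pattern used by the factory below (a hypothetical
 * "extent" counter, for illustration):
 * <pre>
 * final WeakReference&lt;AbstractJournal&gt; ref = new WeakReference&lt;AbstractJournal&gt;(jnl);
 * counters.addCounter("extent", new Instrument&lt;Long&gt;() {
 *     public void sample() {
 *         final AbstractJournal jnl = ref.get();
 *         if (jnl != null) // else the journal has been reclaimed.
 *             setValue(jnl.getBufferStrategy().getExtent());
 *     }
 * });
 * </pre>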
*/ private static class CountersFactory { static public CounterSet getCounters(final AbstractJournal jnl) { final CounterSet counters = new CounterSet(); final WeakReference<AbstractJournal> ref = new WeakReference<AbstractJournal>(jnl); counters.addCounter("file", new Instrument<String>() { @Override public void sample() { final AbstractJournal jnl = ref.get(); if (jnl != null) { final File file = jnl.getFile(); if (file != null) setValue(file.toString()); } } }); counters.addCounter("bufferMode", new Instrument<String>() { @Override public void sample() { final AbstractJournal jnl = ref.get(); if (jnl != null) { final IBufferStrategy bufferStrategy = jnl.getBufferStrategy(); if (bufferStrategy != null) { final BufferMode bufferMode = bufferStrategy.getBufferMode(); if (bufferMode != null) { setValue(bufferMode.toString()); } } } } }); counters.addCounter("groupCommit", new Instrument<Boolean>() { @Override public void sample() { final AbstractJournal jnl = ref.get(); if (jnl != null) { setValue(jnl.isGroupCommit()); } } }); // counters.addCounter("file", new OneShotInstrument<String>("" // + jnl.getFile())); counters.addCounter("createTime", new Instrument<Long>() { @Override public void sample() { final AbstractJournal jnl = ref.get(); if (jnl != null) { final IRootBlockView rootBlock = jnl._rootBlock; if (rootBlock != null) { setValue(rootBlock.getCreateTime()); } } } }); counters.addCounter("closeTime", new Instrument<Long>() { @Override public void sample() { final AbstractJournal jnl = ref.get(); if (jnl != null) { final IRootBlockView rootBlock = jnl._rootBlock; if (rootBlock != null) { setValue(rootBlock.getCloseTime()); } } } }); counters.addCounter("commitCount", new Instrument<Long>() { @Override public void sample() { final AbstractJournal jnl = ref.get(); if (jnl != null) { final IRootBlockView rootBlock = jnl._rootBlock; if (rootBlock != null) { setValue(rootBlock.getCommitCounter()); } } } }); counters.addCounter("historicalIndexCacheSize", new Instrument<Integer>() { @Override public void sample() { final AbstractJournal jnl = ref.get(); if (jnl != null) { setValue(jnl.historicalIndexCache.size()); } } }); counters.addCounter("indexCacheSize", new Instrument<Integer>() { @Override public void sample() { final AbstractJournal jnl = ref.get(); if (jnl != null) { setValue(jnl.indexCache.size()); } } }); counters.addCounter("liveIndexCacheSize", new Instrument<Integer>() { @Override public void sample() { final AbstractJournal jnl = ref.get(); if (jnl != null) { final Name2Addr name2Addr = jnl._name2Addr; if (name2Addr != null) { setValue(name2Addr.getIndexCacheSize()); } } } }); // backing strategy performance counters. counters.attach(jnl._bufferStrategy.getCounters()); // commit protocol performance counters. counters.makePath("commit") .attach(jnl.commitCounters.getCounters()); return counters; } } // /** // * Return the live index counters maintained by the unisolated // * {@link Name2Addr} index iff they are available. These counters are not // * available for a read-only journal. They are also not available if the // * journal has been concurrently shutdown (since the {@link Name2Addr} // * reference will have been cleared). // * <p> // * Note: This is exposed to the {@link Journal} which reports this // * information. // * // * @return The live index counters and <code>null</code> iff they are not // * available. 
// */ // protected CounterSet getLiveIndexCounters() { // if (!isReadOnly()) { // /* // * These index counters are only available for the unisolated // * Name2Addr view. If this is a read-only journal, then we can not // * report out that information. // */ // final Name2Addr tmp = _name2Addr; // if (tmp != null) { // /* // * Only if the Name2Addr index exists at the moment that we look // * (avoids problems with reporting during concurrent shutdown). // */ // return tmp.getIndexCounters(); // } // } // return null; // } /** * Return a {@link CounterSet} reflecting the named indices that are open or * which have been recently opened. * <p> * Note: This method prefers the live view of the index since this gets us * the most recent metadata for the index depth, #of nodes, #of leaves, etc. * However, when the live view is not currently open, we prefer the most * recent view of the index. * <p> * Note: This scans the live {@link Name2Addr}s internal index cache for the * live indices and then scans the {@link #historicalIndexCache} for the * read-only index views. Only a single {@link CounterSet} is reported for * any given named index. * * @return A new {@link CounterSet} reflecting the named indices that were * open as of the time that this method was invoked. * * @see <a href="http://sourceforge.net/apps/trac/bigdata/ticket/626"> * Expose performance counters for read-only indices </a> */ protected final CounterSet getIndexCounters() { // If we find a live view of an index, we put its name in this set. final Set<String/* name */> foundLive = new HashSet<String>(); final CounterSet tmp = new CounterSet(); /* * First, search Name2Addr's internal cache. */ { final Name2Addr n2a = _name2Addr; if (n2a != null) { // Get the live views from n2a. n2a.getIndexCounters(tmp, foundLive); } } /* * Now snapshot the [indexCache], creating a map (name,ndx) mapping. We * will use the most recent view of each named index. * * Note: There are potentially many views of a given named index opened * against different commit points. Each of these can have distinct * metadata for the writeRetentionQueue, depth, #of nodes, #of leaves, * etc. We are preferring the most recent view. * * TODO Note the #of open views for the index (injected field, but then * we need to do this for all views, including the live view). */ final Map<String/* name */, ICheckpointProtocol> map = new HashMap<String/* name */, ICheckpointProtocol>(); { final Iterator<Map.Entry<NT, WeakReference<ICheckpointProtocol>>> itr = indexCache .entryIterator(); while (itr.hasNext()) { final Map.Entry<NT, WeakReference<ICheckpointProtocol>> e = itr .next(); final NT nt = e.getKey(); final ICheckpointProtocol newVal = e.getValue().get(); if (newVal == null) { // Reference was cleared. continue; } final String name = nt.getName(); final ICheckpointProtocol oldVal = map.get(name); if (oldVal != null) { final long oldTime = oldVal.getLastCommitTime(); final long newTime = newVal.getLastCommitTime(); if (newTime > oldTime) { // Prefer the most recent index view. map.put(name, newVal); } } else { // There is no entry yet for this named index. map.put(name, newVal); } } } /* * Now include the CounterSet for each selected read-only index view.
*/ for (Map.Entry<String/* name */, ICheckpointProtocol> e : map .entrySet()) { final String path = e.getKey(); // name final CounterSet aCounterSet = e.getValue().getCounters(); // Attach the CounterSet tmp.makePath(path).attach(aCounterSet); } return tmp; } @Override final public File getFile() { final IBufferStrategy tmp = getBufferStrategy(); if (tmp == null) return null; return tmp.getFile(); } // /** // * The HA log directory. // * // * @see HAJournal.Options#HA_LOG_DIR // * // * @throws UnsupportedOperationException // * always. // */ // public File getHALogDir() { // // throw new UnsupportedOperationException(); // // } /** * Assert that <code>t1</code> LT <code>t2</code>, where <code>t1</code> and * <code>t2</code> are timestamps obtained such that this relation will be * <code>true</code> if the clocks on the nodes are synchronized. * <p> * Note: Clock synchronization errors can arise across nodes if the nodes * are not using a common network time source. * <p> * Note: Synchronization errors can arise on a single node if the clock is * changed on that node - specifically if the clock is moved backwards to * before the most recent commit timestamp. For example, if the timezone is * changed. * * @param serviceId1 * The service that reported the timestamp <code>t1</code>. * @param serviceId2 * The service that reported the timestamp <code>t2</code>. * @param t1 * A timestamp from one service. * @param t2 * A timestamp from the other service. * * @throws ClocksNotSynchronizedException * * @see ClocksNotSynchronizedException */ protected void assertBefore(final UUID serviceId1, final UUID serviceId2, final long t1, final long t2) throws ClocksNotSynchronizedException { // Maximum allowed clock skew. final long maxSkew = getMaximumClockSkewMillis(); ClocksNotSynchronizedException.assertBefore(serviceId1, serviceId2, t1, t2, maxSkew); } /** * The maximum error allowed (milliseconds) in the clocks. This is used by * {@link #assertBefore(UUID, UUID, long, long)} to verify that the clocks * are within some acceptable skew of one another. It is also used by * {@link #nextCommitTimestamp()} where it specifies the maximum clock skew * that will be corrected without operator intervention. * <p> * Note: This is overridden by the HAJournal. * * @see #assertBefore(UUID, UUID, long, long) */ protected long getMaximumClockSkewMillis() { throw new UnsupportedOperationException(); } /** * The HA timeout in milliseconds for a 2-phase prepare. * * @throws UnsupportedOperationException * always. */ public long getHAPrepareTimeout() { throw new UnsupportedOperationException(); } /** * The HA timeout in milliseconds for the release time consensus protocol. * * @throws UnsupportedOperationException * always. */ public long getHAReleaseTimeConsensusTimeout() { throw new UnsupportedOperationException(); } /** * Core implementation of immediate shutdown; handles event reporting. */ protected void _close() { assertOpen(); // if (log.isInfoEnabled()) // log.info("file=" + getFile()); _bufferStrategy.close(); // Stop watching for quorum related events. if (quorum != null) quorum.terminate(); // report event. ResourceManager.closeJournal(getFile() == null ? 
null : getFile().toString()); if (txLog.isInfoEnabled()) txLog.info("CLOSE-JOURNAL: uuid=" + getUUID() + ", file=" + getFile()); // if (LRUNexus.INSTANCE != null) { // // try { // // LRUNexus.INSTANCE.deleteCache(getUUID()); // // } catch (Throwable t) { // // log.error(t, t); // // } // // } if (deleteOnClose) { /* * This option is used by the test suite and MUST NOT be used with * live data. */ deleteResources(); } nclose.incrementAndGet(); } /** * Deletes the backing file(s) (if any). * <p> * Note: This is the core implementation of delete and handles event * reporting. * * @exception IllegalStateException * if the journal is open. */ @Override public void deleteResources() { if (isOpen()) throw new IllegalStateException(); if (log.isInfoEnabled()) log.info(""); final IBufferStrategy bufferStrategy = getBufferStrategy(); if (bufferStrategy != null) { bufferStrategy.deleteResources(); // @see BLZG-1501 (remove LRUNexus) // if (LRUNexus.INSTANCE != null) { // // try { // // LRUNexus.INSTANCE.deleteCache(getUUID()); // // } catch (Throwable t) { // // log.error(t, t); // // } // // } } ResourceManager.deleteJournal(getFile() == null ? null : getFile().toString()); } /** * Truncate the backing buffer such that there is no remaining free space in * the journal. * <p> * Note: The caller MUST have exclusive write access to the journal. When * the {@link ConcurrencyManager} is used, that means that the caller MUST * have an exclusive lock on the {@link WriteExecutorService}. * <p> * Note: The {@link BufferMode#DiskRW} does NOT support this operation. This * is because it stores meta-allocation information at the end of the file, * which makes it impossible to shrink the file. Therefore this method will * return without causing the file on disk to be shrunk for the RWStore. */ public void truncate() { assertOpen(); if (isReadOnly()) throw new IllegalStateException(); final IBufferStrategy backingBuffer = getBufferStrategy(); switch (backingBuffer.getBufferMode()) { case DiskRW: /* * Operation is not supported for the RWStore. */ return; default: break; } final long oldExtent = backingBuffer.getExtent(); final long newExtent = backingBuffer.getHeaderSize() + backingBuffer.getNextOffset(); backingBuffer.truncate(newExtent); if (log.isInfoEnabled()) log.info("oldExtent=" + oldExtent + ", newExtent=" + newExtent); } /** * Make sure that the journal has at least the specified number of bytes of * unused capacity remaining in the user extent. * <p> * Note: You need an exclusive write lock on the journal to extend it. * * @param minFree * The minimum #of bytes free for the user extent. * * @return The #of bytes of free space remaining in the user extent. */ public long ensureMinFree(final long minFree) { assertOpen(); if (minFree < 0L) throw new IllegalArgumentException(); final IBufferStrategy buf = _bufferStrategy; final long remaining = buf.getUserExtent() - buf.getNextOffset(); if (remaining < minFree) { buf.truncate(buf.getExtent() + minFree); } return buf.getUserExtent() - buf.getNextOffset(); } /** * Restart safe conversion of the store into a read-only store with the * specified <i>closeTime</i>. * <p> * This implementation sets the "closeTime" on the root block such that the * journal will no longer accept writes, flushes all buffered writes, and * releases any write cache buffers since they will no longer be used. This * method is normally used when one journal is being closed out for writes * during synchronous overflow processing and new writes will be buffered on * a new journal. 
This has advantages over closing the journal directly * including that it does not disturb concurrent readers. * <p> * Note: The caller MUST have exclusive write access to the journal. When * the {@link ConcurrencyManager} is used, that means that the caller MUST * have an exclusive lock on the {@link WriteExecutorService}. * <p> * Note: This does NOT perform a commit - any uncommitted writes will be * discarded. * * @throws IllegalStateException * If there are no commits on the journal. * * @todo There should also be an option to convert a journal from * {@link BufferMode#Direct} to {@link BufferMode#Disk}. We would want * to do that not when the journal is sealed but as soon as * asynchronous overflow processing is done. Ideally this will not * require us to close and reopen the journal since that will disturb * concurrent readers. */ public void closeForWrites(final long closeTime) { final WriteLock lock = _fieldReadWriteLock.writeLock(); lock.lock(); try { final long lastCommitTime = _rootBlock.getLastCommitTime(); if (log.isInfoEnabled()) log.info("Closing journal for further writes: closeTime=" + closeTime + ", lastCommitTime=" + lastCommitTime); if (log.isDebugEnabled()) log.debug("before: " + _rootBlock); final IRootBlockView old = _rootBlock; if (old.getCommitCounter() == 0L) { throw new IllegalStateException("No commits on journal"); } // release any unused space. truncate(); /* * Create the final root block. * * Note: We MUST bump the commitCounter in order to have the new * root block be selected over the old one! * * Note: This will throw an error if nothing has ever been committed * on the journal. The problem is that the root block does not * permit a non-zero commitCounter unless the commitRecordAddr and * perhaps some other stuff are non-zero as well. */ final long metaStartAddr = _bufferStrategy.getMetaStartAddr(); final long metaBitsAddr = _bufferStrategy.getMetaBitsAddr(); final IRootBlockView newRootBlock = new RootBlockView(// !old.isRootBlock0(), old.getOffsetBits(), old.getNextOffset(), old.getFirstCommitTime(), old .getLastCommitTime(), // old.getCommitCounter() + 1, // old.getCommitRecordAddr(), // old.getCommitRecordIndexAddr(), // old.getUUID(), // 0L, // blockSequence (writes are discarded) quorumToken, // metaStartAddr, // metaBitsAddr, // old.getStoreType(), // old.getCreateTime(), closeTime, // old.getVersion(), checker); /* * Write it on the store. * * Note: We request that the write is forced to disk to ensure that * all buffered writes are forced to the disk. This is necessary in * order to make sure that the updated root block (and anything left * in the write cache for the disk buffer) get forced through onto * the disk. We do not need to specify ForceMetadata here since the * file size is unchanged by this operation. */ _bufferStrategy.writeRootBlock(newRootBlock, ForceEnum.Force); // discard write cache and make store read-only. _bufferStrategy.closeForWrites(); // replace the root block reference. _rootBlock = newRootBlock; if (log.isDebugEnabled()) log.debug("after: " + _rootBlock); // discard current commit record and re-read from the store. _commitRecord = _getCommitRecord(); /* * FIXME Verify that we can safely convert the writeRetentionQueue * and readOnly flags on the BTree and then re-enable this code * block. The tricky issue is the safe publication of the change to * the writeRetentionQueue field (and populating it with the old * queue's data) and the readOnly field. 
If those changes are not * safely published then I also need to consider what the side * effects of inconsistent views of those fields might be. One way * to handle the safe publication is using AtomicBoolean and * AtomicReference for those fields. */ // /* Convert all of the unisolated BTree objects into // read-historical // * BTrees as of the lastCommitTime on the journal. This is done in // * order to benefit from any data buffered by those BTrees since // * buffered data is data that we don't need to read from disk and // we // * don't need to de-serialize. This is especially important for // * asynchronous overflow processing which performs full index // scans // * of the BTree's shortly after synchronous overflow process (and // * which is the main reason why closeForWrites() exists). // * // * Note: The caller already promises that they hold the exclusive // * write lock so we don't really need to synchronize on // [name2Addr]. // * // * Note: If we find a dirty mutable BTree then we ignore it rather // * than repurposing it. This allows the possibility that there are // * uncommitted writes. // */ // synchronized (_name2Addr) { // // final Iterator<Map.Entry<String, WeakReference<BTree>>> itr = // _name2Addr // .indexCacheEntryIterator(); // // while (itr.hasNext()) { // // final java.util.Map.Entry<String, WeakReference<BTree>> entry = // itr // .next(); // // final String name = entry.getKey(); // // final BTree btree = entry.getValue().get(); // // if (btree == null) { // // // Note: Weak reference was cleared. // continue; // // } // // if (btree.needsCheckpoint()) { // // // Note: Don't convert a dirty BTree. // continue; // // } // // // Recover the Entry which has the last checkpointAddr. // final Name2Addr.Entry _entry = _name2Addr.getEntry(name); // // if (_entry == null) { // // /* // * There must be an Entry for each index in Name2Addr's // * cache. // */ // // throw new AssertionError("No entry: name=" + name); // // } // // /* // * Mark the index as read-only (the whole journal no longer // * accepts writes) before placing it in the historical index // * cache (we don't want concurrent requests to be able to // * obtain a BTree that is not marked as read-only from the // * historical index cache). // */ // // btree.convertToReadOnly(); // // /* // * Put the BTree into the historical index cache under that // * checkpointAddr. // * // * Note: putIfAbsent() avoids the potential problem of // * having more than one object for the same checkpointAddr. // */ // // historicalIndexCache.putIfAbsent(_entry.checkpointAddr, // btree); // // } // next index. // // // discard since no writers are allowed. // _name2Addr = null; // // } // close(); } finally { lock.unlock(); } } /** * Invokes {@link #shutdownNow()}. */ @Override synchronized public void close() { // Note: per contract for close(). if (!isOpen()) throw new IllegalStateException(); if (log.isInfoEnabled()) log.info(""); shutdownNow(); } @Override synchronized public void destroy() { if (log.isInfoEnabled()) log.info(""); if (isOpen()) shutdownNow(); if (!deleteOnClose) { /* * Note: if deleteOnClose was specified then the resource was * already deleted by _close(). */ deleteResources(); } ndestroy.incrementAndGet(); } /** * Assert that the store is open. * <p> * Note: You can see an {@link IllegalStateException} thrown out of here if * there are tasks running during {@link #shutdown()} and one of the various * task services times out while awaiting termination. 
Such exceptions are
     * normal since the store was closed asynchronously while task(s) were
     * still running.
     *
     * @exception IllegalStateException
     *                if the store is closed.
     */
    protected void assertOpen() {

        if (_bufferStrategy != null && !_bufferStrategy.isOpen()) {

            throw new IllegalStateException((getFile() == null ? "transient"
                    : "file=" + getFile()));

        }

    }

    @Override
    final public UUID getUUID() {

        return journalMetadata.get().getUUID();

    }

    @Override
    final public IResourceMetadata getResourceMetadata() {

        return journalMetadata.get();

    }

    /**
     * {@inheritDoc}
     * <p>
     * Note: This will report <code>false</code> for a new highly available
     * journal until the quorum has met and {@link #init()} has been invoked
     * for the {@link Quorum}.
     */
    @Override
    public boolean isOpen() {

        return _bufferStrategy != null && _bufferStrategy.isOpen();

    }

    /**
     * Return <code>true</code> if the journal was opened in a read-only mode
     * or if {@link #closeForWrites(long)} was used to seal the journal
     * against further writes.
     */
    @Override
    public boolean isReadOnly() {

        if (readOnly) {
            // Opened in a read-only mode.
            return true;
        }

        if (getRootBlockView().getCloseTime() != 0L) {
            // Closed for writes.
            return true;
        }

        final long token = this.quorumToken;

        /*
         * This code path is too expensive and has been observed to deadlock.
         * Turn this into a non-blocking code path through correct maintenance
         * of the haReadyToken and the haStatus fields.
         */
//        if (token != Quorum.NO_QUORUM) {
//
//            // Quorum exists, are we the leader for that token?
//            final boolean isLeader = quorum.getClient().isLeader(token);
//
//            // read-only unless this is the leader.
//            return !isLeader;
//
//        }

        /*
         * This code path is completely non-blocking. It relies on volatile
         * writes on [quorumToken] and [haStatus].
         *
         * The rule is read-only if there is a met quorum unless this is the
         * leader and read/write if there is no quorum or if the quorum is not
         * met.
         */
        if (token != Quorum.NO_QUORUM) {

            switch (haStatus) {
            case Leader:
                // read/write
                return false;
            case Follower:
                // read-only
                return true;
            case NotReady:
                /*
                 * This case is considered "read-only" locally, but not
                 * available for reads by the HA layer (REST API, SPARQL,
                 * etc).
                 */
                return true;
            default:
                throw new AssertionError();
            }

        }

        /*
         * Note: Default for HA permits read/write access when the quorum is
         * not met. This allows us to make local changes when setting up the
         * service, doing resync, etc.
         */
        return false;

    }

    /**
     * Assert that the journal is readable.
     *
     * @throws IllegalStateException
     *             if the journal is not readable at this time.
     */
    protected void assertCanRead() {

        if (_bufferStrategy == null) {
            // only possible during the constructor call.
            throw new IllegalStateException();
        }

        if (!_bufferStrategy.isOpen()) {
            throw new IllegalStateException();
        }

    }

    /**
     * Assert that the journal is writable.
     *
     * @throws IllegalStateException
     *             if the journal is not writable at this time.
     */
    protected void assertCanWrite() {

        if (_bufferStrategy == null) {
            // only possible during the constructor call.
            throw new IllegalStateException();
        }

        if (!_bufferStrategy.isOpen()) {
            throw new IllegalStateException();
        }

        if (_bufferStrategy.isReadOnly()) {
            throw new IllegalStateException();
        }

        if (abortRequired.get()) {
            /**
             * Do not permit mutation if an abort must be performed.
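             * <p>
             * A caller that sees the {@link AbortRequiredException} thrown
             * below should arrange for {@link #abort()} to be invoked (which
             * clears the flag on success) before retrying its write.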
             *
             * @see http://jira.blazegraph.com/browse/BLZG-181 (Add critical
             *      section protection to AbstractJournal.abort() and
             *      BigdataSailConnection.rollback())
             * @see http://jira.blazegraph.com/browse/BLZG-1236 (Recycler
             *      error in 1.5.1)
             */
            throw new AbortRequiredException();
        }

    }

    @Override
    public boolean isStable() {

        return _bufferStrategy.isStable();

    }

    @Override
    public boolean isFullyBuffered() {

        return _bufferStrategy.isFullyBuffered();

    }

    public boolean isDoubleSync() {

        return doubleSync;

    }

    /**
     * Return <code>true</code> if the persistence store uses record level
     * checksums. When <code>true</code>, the store will detect bad reads by
     * comparing the record as read from the disk against the checksum for
     * that record.
     */
    public boolean isChecked() {

        return _bufferStrategy.useChecksums();

    }

//    /**
//     * Return <code>true</code> if the journal is configured for high
//     * availability.
//     *
//     * @see Quorum#isHighlyAvailable()
//     */
//    public boolean isHighlyAvailable() {
//
//        return quorum == null ? false : quorum.isHighlyAvailable();
//
//    }

    /**
     * {@inheritDoc}
     * <p>
     * Returns the current root block (immediate, non-blocking peek).
     * <p>
     * Note: The root block reference can be <code>null</code> until the
     * journal has been initialized. Once it has been set, the root block will
     * always be non-<code>null</code>. Since this method does not obtain the
     * inner lock, it is possible for another thread to change the root block
     * reference through a concurrent {@link #abort()} or
     * {@link #commitNow(long)}. The {@link IRootBlockView} itself is an
     * immutable data structure.
     */
    @Override
    final public IRootBlockView getRootBlockView() {

//        final ReadLock lock = _fieldReadWriteLock.readLock();
//
//        lock.lock();
//
//        try {

        if (_rootBlock == null) {

            /*
             * This can happen before the journal file has been created. Once
             * it has been created the root block will always be non-null when
             * viewed while holding the lock.
             */
            throw new IllegalStateException();

        }

        return _rootBlock;

//        } finally {
//
//            lock.unlock();
//
//        }

    }

    /**
     * Variant of {@link #getRootBlockView()} that takes the internal lock in
     * order to provide an appropriate synchronization barrier when installing
     * new root blocks onto an empty journal in HA.
     *
     * @see #installRootBlocks(IRootBlockView, IRootBlockView)
     */
    final public IRootBlockView getRootBlockViewWithLock() {

        final ReadLock lock = _fieldReadWriteLock.readLock();

        lock.lock();

        try {

            if (_rootBlock == null) {

                /*
                 * This can happen before the journal file has been created.
                 * Once it has been created the root block will always be
                 * non-null when viewed while holding the lock.
                 */
                throw new IllegalStateException();

            }

            return _rootBlock;

        } finally {

            lock.unlock();

        }

    }

    @Override
    final public long getLastCommitTime() {

//        final ReadLock lock = _fieldReadWriteLock.readLock();
//
//        lock.lock();
//
//        try {

        return _rootBlock.getLastCommitTime();

//        } finally {
//
//            lock.unlock();
//
//        }

    }

    /**
     * Set a persistence capable data structure for callback during the commit
     * protocol.
     * <p>
     * Note: the committers must be reset after restart or whenever the
     * committers are discarded (the committers are themselves transient
     * objects).
     *
     * @param rootSlot
     *            The slot in the root block where the address of the
     *            {@link ICommitter} will be recorded.
     *
     * @param committer
     *            The committer.
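     *
     * <p>
     * A minimal sketch of the registration pattern, as it appears in
     * {@link #setupCommitters()} below (shown here for illustration only):
     *
     * <pre>
     * // while holding the exclusive write lock:
     * setCommitter(DELETEBLOCK,
     *         new DeleteBlockCommitter((IRWStrategy) _bufferStrategy));
     * </pre>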
     */
    @Override
    final public void setCommitter(final int rootSlot,
            final ICommitter committer) {

        assertOpen();

        _committers[rootSlot] = committer;

    }

    /**
     * Notify all registered committers and collect their reported root
     * addresses in an array.
     *
     * @param commitTime
     *            The timestamp assigned to the commit.
     *
     * @return The array of collected root addresses for the registered
     *         committers.
     */
    final private long[] notifyCommitters(final long commitTime) {

        assert commitTime > 0L;

//        int ncommitters = 0;

        final long[] rootAddrs = new long[_committers.length];

        for (int i = 0; i < _committers.length; i++) {

            final ICommitter committer = _committers[i];

            if (committer == null)
                continue;

            final long addr = committer.handleCommit(commitTime);

            rootAddrs[i] = addr;

//            ncommitters++;

        }

        return rootAddrs;

    }

    @Override
    public void abort() {

        final WriteLock lock = _fieldReadWriteLock.writeLock();

        lock.lock();

        try {

            if (quorum != null) {

                try {

                    // HA mode
                    quorum.getClient().abort2Phase(quorumToken);

                } catch (Throwable t) {

                    haLog.error(
                            "2-Phase abort failure. Will do local abort. cause="
                                    + t, t);

                    // Low-level abort.
                    doLocalAbort();

                }

            } else {

                // Non-HA mode.
                doLocalAbort();

            }

        } catch (Throwable e) {

            throw new RuntimeException(e);

        } finally {

            lock.unlock();

        }

    }

    /**
     * Discards any unisolated writes since the last {@link #commitNow(long)}
     * and also discards the unisolated (aka live) btree objects, reloading
     * them from the current {@link ICommitRecord} on demand.
     * <p>
     * Note: The {@link WriteExecutorService} handles commit groups and uses
     * an index {@link Checkpoint} strategy so that it is able to abort
     * individual tasks simply by discarding their changes and without
     * interrupting concurrent writers. An {@link #abort()} is therefore an
     * action of last resort and is generally triggered by things such as
     * running out of disk space or memory within the JVM.
     * <p>
     * If a {@link Thread} is interrupted in the midst of an IO operation on a
     * {@link Channel} then the channel will be asynchronously closed by the
     * JDK. Since some {@link IBufferStrategy}s use a {@link FileChannel} to
     * access the backing store, this means that we need to re-open the
     * backing store transparently so that we can continue operations after
     * the commit group was aborted. This is done automatically when we
     * re-load the current {@link ICommitRecord} from the root blocks of the
     * store.
     */// TODO Could merge with doLocalAbort().
    private void _abort() {
        // @see #1021 (Add critical section protection to AbstractJournal.abort() and BigdataSailConnection.rollback())
        boolean success = false;

        final WriteLock lock = _fieldReadWriteLock.writeLock();

        lock.lock();

        try {

            if (log.isInfoEnabled())
                log.info("ABORT", new StackInfoReport("ABORT"));

            // Clear
            gatherFuture.set(null/* newValue */);

            if (_bufferStrategy == null) {

                // Nothing to do.
                success = true;

                return;

            }

            txLog.info("ABORT");

            // @see BLZG-1501 (remove LRUNexus)
//            if (LRUNexus.INSTANCE != null) {
//
//                /*
//                 * Discard the LRU for this store. It may contain writes which
//                 * have been discarded. The same addresses may be reissued by
//                 * the WORM store after an abort, which could lead to incorrect
//                 * reads from a dirty cache.
//                 *
//                 * FIXME An optimization would essentially isolate the writes on
//                 * the cache per BTree or between commits. At the commit point,
//                 * the written records would be migrated into the "committed"
//                 * cache for the store. The caller would read on the uncommitted
//                 * cache, which would read through to the "committed" cache.
//                 * This would prevent incorrect reads without requiring us to
//                 * throw away valid records in the cache. This could be a
//                 * significant performance gain if aborts are common on a
//                 * machine with a lot of RAM.
//                 */
//
//                LRUNexus.getCache(this).clear();
//
//            }

            invalidateCommitters();

            /*
             * The buffer strategy has a hook which is used to discard
             * buffered writes. This is both an optimization (it ensures that
             * those writes are not asynchronously laid down on the disk) and
             * a requirement for the WORM store.
             *
             * Note: The WriteCache for the WORM store has internal state
             * giving the firstOffset of the records in the internal buffer -
             * if we do not call reset() on that write cache then the
             * firstOffset will be incorrect and the records will be laid down
             * on the wrong offsets on the store. This correctness issue does
             * not arise for the RW store because the WriteCache has the
             * actual offset at which each record will be written and ordered
             * writes are used to lay down those records. However, for the
             * WORM store we use a single large write and require that the
             * data in the buffer exactly matches the target state on the
             * backing file.
             */
            _bufferStrategy.abort();

            /*
             * The Name2Addr reference will be discarded below. This should be
             * sufficient to ensure that any index requested by the methods on
             * the AbstractJournal will be re-read from disk using the commit
             * record which we re-load below. This is necessary in order to
             * discard any checkpoints that may have been written on indices
             * since the last commit (dirty indices that have not been
             * checkpointed to the disk are discarded when we discard
             * Name2Addr).
             *
             * Note: Historical index references should NOT be discarded on
             * abort as they remain valid. Discarding them admits the
             * possibility of a non-canonicalizing cache for the historical
             * indices since an existing historical index reference will
             * continue to be held but a new copy of the index will be loaded
             * on the next request if we clear the cache here.
             */
//            historicalIndexCache.clear();

            // discard the commit record and re-read from the store.
            _commitRecord = _getCommitRecord();

            /*
             * Re-load the commit record index from the address in the current
             * root block.
             *
             * Note: This may not be strictly necessary since the only time we
             * write on this index is a single record during each commit. So,
             * it should be valid to simply catch an error during a commit and
             * discard this index forcing its reload. However, doing this here
             * is definitely safer.
             *
             * Note: This reads on the store. If the backing channel for a
             * stable store was closed by an interrupt, e.g., during an abort,
             * then this will cause the backing channel to be transparently
             * re-opened. At that point both readers and writers will be able
             * to access the channel again.
             */

            // clear reference and reload from the store.
            _commitRecordIndex = _getCommitRecordIndex();

            // clear reference and reload from the store.
            _icuVersionRecord = _getICUVersionRecord();

            // clear the array of committers.
            _committers = new ICommitter[_committers.length];

            // discard any hard references that might be cached.
            discardCommitters();

            /*
             * Setup new committers, e.g., by reloading from their last root
             * addr.
             */
            setupCommitters();

            if (quorum != null) {

                /*
                 * In HA, we need to tell the QuorumService that the database
                 * has done an abort() so it can discard any local state
                 * associated with the current write set (the HALog file and
                 * the last live HA message).
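                 *
                 * If the quorum client is not running, the discard is simply
                 * skipped (see the IllegalStateException handling below).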
                 */
                QuorumService<HAGlue> localService = null;
                try {

                    localService = quorum.getClient();

                } catch (IllegalStateException ex) {

                    /*
                     * Note: Thrown if the QuorumService is not running.
                     */

                    // ignore.

                }

                if (localService != null) {

                    localService.discardWriteSet();

                }

            }

            if (log.isInfoEnabled())
                log.info("done");

            success = true; // mark successful abort.

        } catch (Throwable e) {

            log.error("ABORT FAILED!", e);

            throw new RuntimeException("ABORT FAILED", e);

        } finally {
            // @see #1021 (Add critical section protection to AbstractJournal.abort() and BigdataSailConnection.rollback())
            abortRequired.set(!success);

            lock.unlock();

        }

    }

    /**
     * Rollback a journal to its previous commit point.
     * <p>
     * Note: You MUST have an exclusive write lock on the journal.
     * <p>
     * Note: To restore the last root block we copy the alternative root block
     * over the current root block. That gives us two identical root blocks
     * and restores us to the root block that was in effect before the last
     * commit.
     *
     * @deprecated Do not use this method. HA provides point in time restore.
     *             Use that. Or you can open a journal using the alternate
     *             root block by specifying
     *             {@link Options#ALTERNATE_ROOT_BLOCK}
     */
    public void rollback() {

        final WriteLock lock = _fieldReadWriteLock.writeLock();

        lock.lock();

        try {

            assertOpen();

            if (isReadOnly())
                throw new IllegalStateException();

            txLog.warn("ROLLBACK");

            /*
             * Read the alternate root block (NOT the current one!).
             */
            final ByteBuffer buf = _bufferStrategy
                    .readRootBlock(!_rootBlock.isRootBlock0());

            /*
             * Create a view from the alternate root block, but using the SAME
             * [rootBlock0] flag state as the current root block so that this
             * will overwrite the current root block.
             */
            final IRootBlockView newRootBlock = new RootBlockView(
                    _rootBlock.isRootBlock0(), buf, checker);

            /*
             * Overwrite the current root block on the backing store with the
             * state of the alternate root block.
             */
            _bufferStrategy.writeRootBlock(newRootBlock, forceOnCommit);

            // Use the new root block.
            _rootBlock = newRootBlock;

            /*
             * Discard all in-memory state - it will need to be re-loaded from
             * the restored root block.
             */
            abort();

            /**
             * Ensure cache is clear to prevent access to invalid BTree from
             * last commit point
             */
            historicalIndexCache.clear();
            indexCache.clear();

        } finally {

            lock.unlock();

        }

    }

//    /**
//     * Return the object providing the {@link AbstractLocalTransactionManager}
//     * for this journal.
//     */
//    abstract public AbstractLocalTransactionManager getLocalTransactionManager();

    @Override
    public boolean isDirty() {

        return _bufferStrategy.isDirty();

    }

    /**
     * Get timestamp that will be assigned to this commit point.
     * <p>
     * Note: This will spin until commit time advances over
     * <code>lastCommitTime</code>, but not for more than N milliseconds. This
     * will allow us to ensure that time moves forward when the leader fails
     * over to another node with modest clock skew. If there is a large clock
     * skew, operator intervention will be required.
     * <p>
     * Note: This also makes sense for a non-HA deployment since we still want
     * time to move forward at each commit point.
     *
     * TODO This also makes sense when the Journal is opened since we often
     * issue queries against historical commit points on the journal based on
     * the clock. [Unit test for this in standalone and HA modes?]
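     * <p>
     * The spin can be summarized by this sketch (simplified from the method
     * body below; interrupt handling elided):
     *
     * <pre>
     * long t;
     * while ((t = transactionManager.nextTimestamp()) &lt;= lastCommitTime) {
     *     final long delta = Math.abs(t - lastCommitTime);
     *     if (delta &gt; getMaximumClockSkewMillis())
     *         throw new ClocksNotSynchronizedException(...);
     *     Thread.sleep(delta); // wait for the skew to expire.
     * }
     * return t; // the commitTime.
     * </pre>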
*/ private long nextCommitTimestamp() { final IRootBlockView rootBlock = _rootBlock; final long lastCommitTime = rootBlock.getLastCommitTime(); if (lastCommitTime < 0) throw new RuntimeException( "Last commit time is invalid in rootBlock: " + rootBlock); final long commitTime; { final ILocalTransactionManager transactionManager = getLocalTransactionManager(); boolean warned = false; while (true) { final long t = transactionManager.nextTimestamp(); if (t > lastCommitTime) { /* * We have a distinct timestamp. Time is moving forward. */ commitTime = t; break; } /* * Time is going backwards. Figure out by how much. * * Note: delta is in ms. */ final long delta = Math.abs(t - lastCommitTime); if (delta > getMaximumClockSkewMillis()/* ms */) throw new ClocksNotSynchronizedException("Clocks off by " + delta + " ms: lastCommitTime=" + lastCommitTime + ", but localTimestamp=" + t); if (!warned) { log.warn("Clocks off by " + delta + " ms: lastCommitTime=" + lastCommitTime + ", but localTimestamp=" + t); warned = true; } try { // Wait for the delta to expire. Thread.sleep(delta/* ms */); } catch (InterruptedException ex) { // Propagate interrupt. Thread.currentThread().interrupt(); } } } return commitTime; } @Override public long commit() { // The timestamp to be assigned to this commit point. final long commitTime = nextCommitTimestamp(); // do the commit. final IRootBlockView lastRootBlock = _rootBlock; final long commitTime2; try { commitTime2 = commitNow(commitTime); } catch (Throwable t) { throw new RuntimeException(t.getLocalizedMessage() + ": lastRootBlock=" + lastRootBlock, t); } if (commitTime2 == 0L) { // Nothing to commit. return 0L; } // commitNow() should return either 0L or the commitTime we gave it. assert commitTime2 == commitTime; /* * Now that we have committed the data we notify the federation that it * should advance its global lastCommitTime. * * @todo we could use IBufferStrategy#rollback() if the notice failed, * e.g., due to a service outage. */ getLocalTransactionManager().notifyCommit(commitTime); return commitTime; } /** * Performance counters for the journal-level commit operations. */ private static class CommitCounters implements ICounterSetAccess { /** * Elapsed nanoseconds for the {@link ICommitter#handleCommit(long)} * (flushing dirty pages from the indices into the write cache service). */ private final CAT elapsedNotifyCommittersNanos = new CAT(); /** * Elapsed nanoseconds for {@link CommitState#writeCommitRecord()}. * Note: This is also responsible for recycling the deferred frees for * {@link IHistoryManager} backends. */ private final CAT elapsedWriteCommitRecordNanos = new CAT(); /** * Elapsed nanoseconds for flushing the write set from the write cache * service to the backing store (this is the bulk of the disk IO unless * the write cache service fills up during a long running commit, in * which case there is also incremental eviction). */ private final CAT elapsedFlushWriteSetNanos = new CAT(); /** * Elapsed nanoseconds for the simple atomic commit (non-HA). This * consists of sync'ing the disk (iff double-sync is enabled), writing * the root block, and then sync'ing the disk. */ private final CAT elapsedSimpleCommitNanos = new CAT(); /** * Elapsed nanoseconds for the entire commit protocol. */ private final CAT elapsedTotalCommitNanos = new CAT(); // // HA counters // /** * Elapsed nanoseconds for GATHER (consensus release time protocol : HA * only). 
         */
        private final CAT elapsedGatherNanos = new CAT();

        /**
         * Elapsed nanoseconds for PREPARE (2-phase commit: HA only).
         */
        private final CAT elapsedPrepare2PhaseNanos = new CAT();

        /**
         * Elapsed nanoseconds for COMMIT2PHASE (2-phase commit: HA only).
         */
        private final CAT elapsedCommit2PhaseNanos = new CAT();

        @Override
        public CounterSet getCounters() {

            final CounterSet root = new CounterSet();

            root.addCounter("notifyCommittersSecs", new Instrument<Double>() {
                @Override
                public void sample() {
                    final double secs = (elapsedNotifyCommittersNanos.get() / 1000000000.);
                    setValue(secs);
                }
            });

            root.addCounter("writeCommitRecordSecs", new Instrument<Double>() {
                @Override
                public void sample() {
                    final double secs = (elapsedWriteCommitRecordNanos.get() / 1000000000.);
                    setValue(secs);
                }
            });

            root.addCounter("flushWriteSetSecs", new Instrument<Double>() {
                @Override
                public void sample() {
                    final double secs = (elapsedFlushWriteSetNanos.get() / 1000000000.);
                    setValue(secs);
                }
            });

            root.addCounter("simpleCommitSecs", new Instrument<Double>() {
                @Override
                public void sample() {
                    final double secs = (elapsedSimpleCommitNanos.get() / 1000000000.);
                    setValue(secs);
                }
            });

            root.addCounter("totalCommitSecs", new Instrument<Double>() {
                @Override
                public void sample() {
                    final double secs = (elapsedTotalCommitNanos.get() / 1000000000.);
                    setValue(secs);
                }
            });

            //
            // HA
            //

            root.addCounter("gatherSecs", new Instrument<Double>() {
                @Override
                public void sample() {
                    final double secs = (elapsedGatherNanos.get() / 1000000000.);
                    setValue(secs);
                }
            });

            root.addCounter("prepare2PhaseSecs", new Instrument<Double>() {
                @Override
                public void sample() {
                    final double secs = (elapsedPrepare2PhaseNanos.get() / 1000000000.);
                    setValue(secs);
                }
            });

            root.addCounter("commit2PhaseSecs", new Instrument<Double>() {
                @Override
                public void sample() {
                    final double secs = (elapsedCommit2PhaseNanos.get() / 1000000000.);
                    setValue(secs);
                }
            });

            return root;

        }

    }

    final private CommitCounters commitCounters = new CommitCounters();

    /**
     * Class to which we attach all of the little pieces of state during
     * {@link AbstractJournal#commitNow(long)}.
     * <p>
     * The non-final fields in this class are declared directly below the
     * methods which set those fields. The methods in the class are laid out
     * in the top-to-bottom order in which they are executed by commitNow().
     */
    static private class CommitState {

        /**
         * The timestamp at which the commit began.
         */
        private final long beginNanos;

        /**
         * The backing store.
         */
        private final AbstractJournal store;

        /**
         * The backing {@link IBufferStrategy} for the {@link #store}.
         */
        private final IBufferStrategy _bufferStrategy;

        /**
         * The quorum iff HA and <code>null</code> otherwise.
         */
        private final Quorum<HAGlue, QuorumService<HAGlue>> quorum;

        /**
         * Local HA service implementation (non-Remote) and <code>null</code>
         * if not in an HA mode.
         */
        private final QuorumService<HAGlue> quorumService;

        /**
         * The commit time either of a transaction or of an unisolated commit.
         * Note that when mixing isolated and unisolated commits you MUST use
         * the same {@link ITimestampService} for both purposes.
         */
        private final long commitTime;

        /**
         * The current root block on the journal as of the start of the commit
         * protocol.
         */
        private final IRootBlockView old;

        /**
         * The quorum token associated with this commit point.
         */
        private final long commitToken;

//        /** The #of bytes on the journal as of the previous commit point. */
//        private final long byteCountBefore;

        /**
         * The commit counter that will be assigned to the new commit point.
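         * <p>
         * Note: This is always <code>old.getCommitCounter() + 1</code> (set
         * in the constructor below).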
         */
        private final long newCommitCounter;

        /**
         *
         * @param store
         *            The backing store.
         * @param commitTime
         *            The commit time either of a transaction or of an
         *            unisolated commit. Note that when mixing isolated and
         *            unisolated commits you MUST use the same
         *            {@link ITimestampService} for both purposes.
         */
        public CommitState(final AbstractJournal store, final long commitTime) {

            if (store == null)
                throw new IllegalArgumentException();

            this.beginNanos = System.nanoTime();

            this.store = store;

            this.commitTime = commitTime;

            this._bufferStrategy = store._bufferStrategy;

            // Note: null if not HA.
            this.quorum = store.quorum;

            /*
             * Local HA service implementation (non-Remote).
             *
             * Note: getClient() throws IllegalStateException if the quorum
             * exists and is not running.
             */
            this.quorumService = quorum == null ? null : quorum.getClient();

            this.old = store._rootBlock;

//            // #of bytes on the journal as of the previous commit point.
//            this.byteCountBefore = store._rootBlock.getNextOffset();

            this.newCommitCounter = old.getCommitCounter() + 1;

            this.commitToken = store.quorumToken;

            store.assertCommitTimeAdvances(commitTime);

        }

        /**
         * Notify {@link ICommitter}s to flush out application data. This sets
         * the {@link #rootAddrs} for the {@link ICommitRecord}.
         *
         * @return <code>true</code> if the store is dirty and the commit
         *         should proceed and <code>false</code> otherwise.
         */
        private boolean notifyCommitters() {

            final long beginNanos = System.nanoTime();

            /*
             * First, run each of the committers accumulating the updated root
             * addresses in an array. In general, these are btrees and they
             * may have dirty nodes or leaves that need to be evicted onto the
             * store. The first time through, any newly created btrees will
             * have dirty empty roots (the btree code does not optimize away
             * an empty root at this time). However, subsequent commits
             * without intervening data written on the store should not cause
             * any committers to update their root address.
             *
             * Note: This also checkpoints the deferred free block list.
             */
            rootAddrs = store.notifyCommitters(commitTime);

            /*
             * See if anything has been written on the store since the last
             * commit.
             */
            if (!_bufferStrategy.requiresCommit(store._rootBlock)) {

                /*
                 * Will not do commit.
                 *
                 * Note: No data was written onto the store so the commit can
                 * not achieve any useful purpose.
                 */
                return false;

            }

            /*
             * Explicitly call the RootBlockCommitter
             *
             * Note: This logs the current root block and sets the address of
             * that root block as a root address in the commitRecord. This is
             * of potential use solely in disaster recovery scenarios where
             * your root blocks are toast, but good root blocks can be found
             * elsewhere in the file. Once you find a root block, you can get
             * the commitRecordIndex and then find earlier root blocks using
             * that root addr. Or you can just scan the file looking for valid
             * root blocks and then use the most recent one that you can find.
             */
            rootAddrs[PREV_ROOTBLOCK] = store.m_rootBlockCommitter
                    .handleCommit(commitTime);

            store.commitCounters.elapsedNotifyCommittersNanos.add(System
                    .nanoTime() - beginNanos);

            // Will do commit.
            return true;

        }

        /**
         * The new root addresses for the {@link ICommitRecord}.
         *
         * @see #notifyCommitters()
         */
        private long[] rootAddrs;

        /**
         * Write out the {@link ICommitRecord}, noting the
         * {@link #commitRecordAddr}, add the {@link ICommitRecord} to the
         * {@link CommitRecordIndex}. Finally, checkpoint the
         * {@link CommitRecordIndex} setting the {@link #commitRecordIndexAddr}.
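         * The {@link #commitRecordAddr} and {@link #commitRecordIndexAddr}
         * noted here are then stamped into the new root block by
         * {@link #newRootBlock()}.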
         * <p>
         * Note: This is also responsible for recycling the deferred frees for
         * {@link IHistoryManager} backends.
         */
        private void writeCommitRecord() {

            final long beginNanos = System.nanoTime();

            /*
             * Before flushing the commitRecordIndex we need to check for
             * deferred frees that will prune the index.
             *
             * This is responsible for recycling the deferred frees (RWS).
             *
             * Do this BEFORE adding the new commit record since that commit
             * record will otherwise be immediately removed if no history is
             * retained.
             */
            if (_bufferStrategy instanceof IHistoryManager) {

                ((IHistoryManager) _bufferStrategy).checkDeferredFrees(store);

            }

            final ICommitRecord commitRecord = new CommitRecord(commitTime,
                    newCommitCounter, rootAddrs);

            this.commitRecordAddr = store.write(ByteBuffer
                    .wrap(CommitRecordSerializer.INSTANCE
                            .serialize(commitRecord)));

            /*
             * Add the commit record to an index so that we can recover
             * historical states efficiently.
             */
            store._commitRecordIndex.add(commitRecordAddr, commitRecord);

            /*
             * Flush the commit record index to the store and stash the
             * address of its metadata record in the root block.
             *
             * Note: The address of the root of the CommitRecordIndex itself
             * needs to go right into the root block. We are unable to place
             * it into the commit record since we need to serialize the commit
             * record, get its address, and add the entry to the
             * CommitRecordIndex before we can flush the CommitRecordIndex to
             * the store.
             */
            commitRecordIndexAddr = store._commitRecordIndex.writeCheckpoint();

            store.commitCounters.elapsedWriteCommitRecordNanos.add(System
                    .nanoTime() - beginNanos);

        }

        /**
         * The address of the {@link ICommitRecord}.
         *
         * @see #writeCommitRecord()
         */
        private long commitRecordAddr;

        /**
         * The address of the {@link CommitRecordIndex} once it has been
         * checkpointed against the backing store.
         * <p>
         * Note: The address of the root of the {@link CommitRecordIndex}
         * needs to go right into the {@link IRootBlockView}. We are unable to
         * place it into the {@link ICommitRecord} since we need to serialize
         * the {@link ICommitRecord}, get its address, and add the entry to
         * the {@link CommitRecordIndex} before we can flush the
         * {@link CommitRecordIndex} to the store.
         *
         * @see #writeCommitRecord()
         */
        private long commitRecordIndexAddr;

        /**
         * Call commit on {@link IBufferStrategy} prior to creating the new
         * {@link IRootBlockView}. This will flush the
         * {@link WriteCacheService}. For HA, that ensures that the write set
         * has been replicated to the followers.
         * <p>
         * Note: required for {@link RWStore} since the metaBits allocations
         * are not made until commit, leading to invalid addresses for recent
         * store allocations.
         * <p>
         * Note: After this, we do not write anything on the backing store
         * other than the root block. The rest of this code is dedicated to
         * creating a properly formed root block. For a non-HA deployment, we
         * just lay down the root block. For an HA deployment, we do a 2-phase
         * commit.
         * <p>
         * Note: In HA, the followers lay down the replicated writes
         * synchronously. Thus, they are guaranteed to be on local storage by
         * the time the leader finishes WriteCacheService.flush(). This does
         * not create much latency because the WriteCacheService drains the
         * dirtyList in a separate thread.
         */
        private void flushWriteSet() {

            final long beginNanos = System.nanoTime();

            _bufferStrategy.commit();

            store.commitCounters.elapsedFlushWriteSetNanos.add(System
                    .nanoTime() - beginNanos);

        }

        /**
         * Create the new root block.
         */
        private void newRootBlock() {

            /*
             * The next offset at which user data would be written.
Calculated,
             * after commit!
             */
            final long nextOffset = _bufferStrategy.getNextOffset();

            final long blockSequence;
            if (_bufferStrategy instanceof IHABufferStrategy) {
                // always available for HA.
                blockSequence = ((IHABufferStrategy) _bufferStrategy)
                        .getBlockSequence();
            } else {
                blockSequence = old.getBlockSequence();
            }

            /*
             * Update the firstCommitTime the first time a transaction commits
             * and the lastCommitTime each time a transaction commits (these
             * are commit timestamps of isolated or unisolated transactions).
             */
            final long firstCommitTime = (old.getFirstCommitTime() == 0L ? commitTime
                    : old.getFirstCommitTime());

            final long priorCommitTime = old.getLastCommitTime();

            if (priorCommitTime != 0L) {

                /*
                 * This is a local sanity check to make sure that the commit
                 * timestamps are strictly increasing. An error will be
                 * reported if the commit time for the current (un)isolated
                 * transaction is not strictly greater than the last commit
                 * time on the store as read back from the current root block.
                 */
                assertPriorCommitTimeAdvances(commitTime, priorCommitTime);

            }

            final long lastCommitTime = commitTime;

            final long metaStartAddr = _bufferStrategy.getMetaStartAddr();

            final long metaBitsAddr = _bufferStrategy.getMetaBitsAddr();

            // Create the new root block.
            newRootBlock = new RootBlockView(!old.isRootBlock0(),
                    old.getOffsetBits(), nextOffset, firstCommitTime,
                    lastCommitTime, newCommitCounter, commitRecordAddr,
                    commitRecordIndexAddr, old.getUUID(), //
                    blockSequence, commitToken,//
                    metaStartAddr, metaBitsAddr, old.getStoreType(),
                    old.getCreateTime(), old.getCloseTime(),
                    old.getVersion(), store.checker);

        }

        /**
         * The new {@link IRootBlockView}.
         *
         * @see #newRootBlock()
         */
        private IRootBlockView newRootBlock;

        /**
         * Run the GATHER consensus protocol (iff HA).
         */
        private void gatherPhase() {

            final long beginNanos = System.nanoTime();

            /*
             * If not HA, do not do GATHER.
             */
            if (!(_bufferStrategy instanceof IHABufferStrategy))
                return;

            if (quorum == null)
                return;

            if (!quorum.isHighlyAvailable()) {
                // Gather and 2-phase commit are not used in HA1.
                return;
            }

            /**
             * CRITICAL SECTION. We need to obtain a distributed consensus for
             * the services joined with the met quorum concerning the earliest
             * commit point that is pinned by the combination of the active
             * transactions and the minReleaseAge on the TXS. New transaction
             * starts during this critical section will block (on the leader
             * or the follower) unless they are guaranteed to be allowable,
             * e.g., based on the current minReleaseAge, the new tx would read
             * from the most recent commit point, the new tx would read from a
             * commit point that is already pinned by an active transaction on
             * that node, etc.
             *
             * Note: Lock makes this section MUTEX with awaitServiceJoin().
             *
             * @see <a href=
             *      "https://docs.google.com/document/d/14FO2yJFv_7uc5N0tvYboU-H6XbLEFpvu-G8RhAzvxrk/edit?pli=1#"
             *      > HA TXS Design Document </a>
             *
             * @see <a
             *      href="https://sourceforge.net/apps/trac/bigdata/ticket/623"
             *      > HA TXS / TXS Bottleneck </a>
             */
            store._gatherLock.lock();

            try {

                // Atomic decision point for GATHER re joined services.
                gatherJoinedAndNonJoinedServices = new JoinedAndNonJoinedServices(
                        quorum);

                // Run the GATHER protocol.
                consensusReleaseTime = ((AbstractHATransactionService) store
                        .getLocalTransactionManager().getTransactionService())
                        .updateReleaseTimeConsensus(newCommitCounter,
                                commitTime, gatherJoinedAndNonJoinedServices
                                        .getJoinedServiceIds(), store
                                        .getHAReleaseTimeConsensusTimeout(),
                                TimeUnit.MILLISECONDS);

            } catch (Exception ex) {

                log.error(ex, ex);

                // Wrap and rethrow.
                throw new RuntimeException(ex);

            } finally {

                store._gatherLock.unlock();

                store.commitCounters.elapsedGatherNanos.add(System.nanoTime()
                        - beginNanos);

            }

        }

        /**
         * Set by {@link #gatherPhase()} IFF HA.
         */
        private IJoinedAndNonJoinedServices gatherJoinedAndNonJoinedServices = null;

        /**
         * Set by {@link #gatherPhase()} IFF HA.
         */
        private IHANotifyReleaseTimeResponse consensusReleaseTime = null;

        /**
         * Simple (non-HA) commit.
         */
        private void commitSimple() {

            final long beginNanos = System.nanoTime();

            /*
             * Force application data to stable storage _before_ we update the
             * root blocks. This option guarantees that the application data
             * is stable on the disk before the atomic commit. Some operating
             * systems and/or file systems may otherwise choose an ordered
             * write with the consequence that the root blocks are laid down
             * on the disk before the application data and a hard failure
             * could result in the loss of application data addressed by the
             * new root blocks (data loss on restart).
             *
             * Note: We do not force the file metadata to disk. If that is
             * done, it will be done by a force() after we write the root
             * block on the disk.
             */
            if (store.doubleSync) {

                _bufferStrategy.force(false/* metadata */);

            }

            // write the root block on to the backing store.
            _bufferStrategy.writeRootBlock(newRootBlock, store.forceOnCommit);

            if (_bufferStrategy instanceof IRWStrategy) {

                /*
                 * Now that the root blocks are down we can commit any
                 * transient state.
                 */
                ((IRWStrategy) _bufferStrategy).postCommit();

            }

            // set the new root block.
            store._rootBlock = newRootBlock;

            // reload the commit record from the new root block.
            store._commitRecord = store._getCommitRecord();

            if (quorum != null) {

                /**
                 * Write the root block on the HALog file, closing out that
                 * file.
                 *
                 * @see <a href="http://trac.blazegraph.com/ticket/721"> HA1
                 *      </a>
                 */
                final QuorumService<HAGlue> localService = quorum.getClient();

                if (localService != null) {

                    // Quorum service not asynchronously closed.
                    try {

                        // Write the closing root block on the HALog file.
                        localService.logRootBlock(newRootBlock);

                    } catch (IOException e) {

                        throw new RuntimeException(e);

                    }

                }

            }

            if (txLog.isInfoEnabled())
                txLog.info("COMMIT: commitTime=" + commitTime);

            store.commitCounters.elapsedSimpleCommitNanos.add(System
                    .nanoTime() - beginNanos);

        }

        /**
         * HA mode commit (2-phase commit).
         */
        private void commitHA() {

            try {

                prepare2Phase();

                commit2Phase();

            } catch (Exception e) {

                // launder throwable.
                throw new RuntimeException(e);

            }

        }

        /**
         * PREPARE
         * <p>
         * Note: We need to make an atomic decision here regarding whether a
         * service is joined with the met quorum or not. This information will
         * be propagated through the HA 2-phase prepare message so services
         * will know how they must interpret the 2-phase prepare(), commit(),
         * and abort() requests. The atomic decision is necessary in order to
         * enforce a consistent role on a service that is resynchronizing and
         * which might vote to join the quorum and enter the quorum
         * asynchronously with respect to this decision point.
         *
         * TODO If necessary, we could also explicitly provide the zk version
         * metadata for the znode that is the parent of the joined services.
         * However, we would need an expanded interface to get that metadata
         * from zookeeper out of the Quorum.
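         * <p>
         * If this PREPARE fails or is rejected, a 2-phase abort is issued so
         * that the other services discard their dirty write sets (see the
         * finally block in the method body below).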
* * @throws IOException * @throws TimeoutException * @throws InterruptedException */ private void prepare2Phase() throws InterruptedException, TimeoutException, IOException { final long beginNanos = System.nanoTime(); boolean didPrepare = false; try { // Atomic decision point for joined vs non-joined services. prepareJoinedAndNonJoinedServices = new JoinedAndNonJoinedServices( quorum); prepareRequest = new PrepareRequest(// consensusReleaseTime,// gatherJoinedAndNonJoinedServices,// prepareJoinedAndNonJoinedServices,// newRootBlock,// quorumService.getPrepareTimeout(), // timeout TimeUnit.MILLISECONDS// ); // issue prepare request. prepareResponse = quorumService.prepare2Phase(prepareRequest); if (haLog.isInfoEnabled()) haLog.info(prepareResponse.toString()); if (!prepareResponse.willCommit()) { // PREPARE rejected. throw new QuorumException("PREPARE rejected: nyes=" + prepareResponse.getYesCount() + ", replicationFactor=" + prepareResponse.replicationFactor()); } didPrepare = true; } finally { if (!didPrepare) { /* * Something went wrong. Any services that were in the * pipeline could have a dirty write set. Services that * voted NO will have already discarded their dirty write * set. We issue an abort2Phase() to tell the other services * to discard the dirty write set as well. * * TODO We only need to issue the 2-phase abort against * those services that (a) were joined with the met quorum; * and (b) voted YES in response to the PREPARE message (any * service that voted NO already discarded its dirty write * set). * * TODO The service should only do the 2-phase abort if the * commitToken and commitCounter are valid. If the quorum * breaks, then the services will move into the Error state * and will do a local abort as part of that transition. */ try { quorumService.abort2Phase(commitToken); } catch (Throwable t) { log.warn(t, t); } } store.commitCounters.elapsedPrepare2PhaseNanos.add(System .nanoTime() - beginNanos); } } // Fields set by the method above. private IJoinedAndNonJoinedServices prepareJoinedAndNonJoinedServices; private PrepareRequest prepareRequest; private PrepareResponse prepareResponse; /** * COMMIT. * * Pre-condition: PREPARE was successful on a majority of the services. */ private void commit2Phase() throws Exception { final long beginNanos = System.nanoTime(); boolean didCommit = false; try { /* * Prepare was successful. COMMIT message has been formed. We * will now commit. * * Note: The overall commit will fail unless we can prove that a * majority of the services successfully committed. */ commitRequest = new CommitRequest(prepareRequest, prepareResponse); commitResponse = quorumService.commit2Phase(commitRequest); if (!store.quorum.isQuorum(commitResponse.getNOk())) { /* * Fail the commit. * * Note: An insufficient number of services were able to * COMMIT successfully. * * Note: It is possible that a commit could be failed here * when the commit is in fact stable on a majority of * services. For example, with k=3 and 2 services running if * both of them correctly update their root blocks but we * lose network connectivity to the follower before the RMI * returns, then we will fail the commit. */ // Note: Guaranteed to not return normally! commitResponse.throwCauses(); } didCommit = true; } finally { if (!didCommit) { /* * The quorum voted to commit, but something went wrong. * * This forces the leader to fail over. The quorum can then * meet up again around a new consensus. * * Note: It is possible that a new consensus can not be * formed. 
The 2-phase commit protocol does not handle some * cases of compound failure. For example, consider the case * where the HA cluster is running with bare majority of * services. All services that are active vote YES, but one * of those services fails before processing the COMMIT * message. The quorum will not meet again unless a new * consensus can be formed from the services that remain up * and the services that were not running. The services that * were not running will be at an earlier commit point so * they can not form a consensus with the services that * remain up. Further, there can not be a majority among the * services that were not running (except in the edge case * where the failed commit was the first commit since the * last commit on one of the services that was down at the * start of that failed commit). Situations of this type * require operator intervention. E.g., explicitly rollback * the database or copy HALog files from one machine to * another such that it will apply those HALog files on * restart and form a consensus with the other services. */ quorumService.enterErrorState(); } store.commitCounters.elapsedCommit2PhaseNanos.add(System .nanoTime() - beginNanos); } } // Fields set by the method above. private CommitRequest commitRequest; private CommitResponse commitResponse; } // class CommitState. /** * An atomic commit is performed by directing each registered * {@link ICommitter} to flush its state onto the store using * {@link ICommitter#handleCommit(long)}. The address returned by that * method is the address from which the {@link ICommitter} may be reloaded * (and its previous address if its state has not changed). That address is * saved in the {@link ICommitRecord} under the index for which that * committer was {@link #registerCommitter(int, ICommitter) registered}. We * then force the data to stable store, update the root block, and force the * root block and the file metadata to stable store. * <p> * Note: Each invocation of this method MUST use a distinct * <i>commitTime</i> and the commitTimes MUST be monotonically increasing. * These guarantees support both the database version history mechanisms and * the High Availability mechanisms. * * @param commitTime * The commit time either of a transaction or of an unisolated * commit. Note that when mixing isolated and unisolated commits * you MUST use the same {@link ITimestampService} for both * purposes. * * @return The timestamp assigned to the commit record -or- 0L if there were * no data to commit. */ // Note: Overridden by StoreManager (DataService). protected long commitNow(final long commitTime) { final long beginNanos = System.nanoTime(); final WriteLock lock = _fieldReadWriteLock.writeLock(); lock.lock(); try { assertOpen(); if (log.isInfoEnabled()) log.info("commitTime=" + commitTime); // Critical Section Check. @see #1021 (Add critical section protection to AbstractJournal.abort() and BigdataSailConnection.rollback()) if (abortRequired.get()) throw new AbortRequiredException(); final CommitState cs = new CommitState(this, commitTime); /* * Flush application data, decide whether or not the store is dirty, * and return immediately if it is not dirty. */ if (!cs.notifyCommitters()) { if (log.isInfoEnabled()) log.info("Nothing to commit"); return 0L; } // Do GATHER (iff HA). cs.gatherPhase(); /* * Flush deferred frees (iff RWS), write the commit record onto the * store, and write the commit record index onto the store. 
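             *
             * (The addresses noted here are stamped into the new root block
             * by cs.newRootBlock() below.)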
             */
            cs.writeCommitRecord();

            if (quorum != null) {

                /*
                 * Verify that the last negotiated quorum is still valid.
                 */
                quorum.assertLeader(cs.commitToken);

            }

            /*
             * Conditionally obtain a lock that will protect the
             * commit()/postCommit() protocol.
             */
//            final long nextOffset;
            final Lock commitLock;
            if (_bufferStrategy instanceof IRWStrategy) {
                commitLock = ((IRWStrategy) _bufferStrategy).getCommitLock();
            } else {
                commitLock = null;
            }

            if (commitLock != null) {
                // Take the commit lock.
                commitLock.lock();
            }

            try {

                // Flush writes to the backing store / followers.
                cs.flushWriteSet();

                // Prepare the new root block.
                cs.newRootBlock();

                if (quorum == null || quorum.replicationFactor() == 1) {

                    // Non-HA mode (including HA1).
                    cs.commitSimple();

                } else {

                    // HA mode commit (2-phase commit).
                    cs.commitHA();

                } // else HA mode

            } finally {

                if (commitLock != null) {
                    /*
                     * Release the [commitLock] iff one was taken above.
                     */
                    commitLock.unlock();
                }

            }

            final long elapsedNanos = System.nanoTime() - cs.beginNanos;

            if (BigdataStatics.debug || log.isInfoEnabled()) {
                final String msg = "commit: commitTime=" + cs.commitTime
                        + ", commitCounter=" + cs.newCommitCounter
                        + ", latency="
                        + TimeUnit.NANOSECONDS.toMillis(elapsedNanos);
//                        + ", nextOffset="
//                        + cs.newRootBlock.getNextOffset()
//                        + ", byteCount="
//                        + (cs.newRootBlock.getNextOffset() - cs.byteCountBefore);
                if (BigdataStatics.debug)
                    System.err.println(msg);
                else if (log.isInfoEnabled())
                    log.info(msg);
//                if (BigdataStatics.debug && LRUNexus.INSTANCE != null) {
//                    System.err.println(LRUNexus.INSTANCE.toString());
//                }
            }

            return cs.commitTime;

        } finally {

            lock.unlock();

            commitCounters.elapsedTotalCommitNanos.add(System.nanoTime()
                    - beginNanos);

        }

    }

//    /**
//     * (debug only) For the {@link RWStrategy}, scans the
//     * {@link #historicalIndexCache} and verifies that there are no checkpoint
//     * addresses present which are "locked".
//     */
//    private boolean assertHistoricalIndexCacheIsClean() {
//
//        if (true) return true; // disable
//
//        if (!(getBufferStrategy() instanceof RWStrategy))
//            return true;
//
//        final RWStrategy bufferStrategy = (RWStrategy) getBufferStrategy();
//
//        final Iterator<Map.Entry<Long, WeakReference<BTree>>> itr = historicalIndexCache
//                .entryIterator();
//
//        while (itr.hasNext()) {
//
//            final Map.Entry<Long, WeakReference<BTree>> e = itr.next();
//
//            bufferStrategy.assertNotLocked(e.getKey());
//
//            final BTree btree = e.getValue().get();
//
//            if (btree != null) {
//
//                bufferStrategy.assertNotLocked(btree.getCheckpoint()
//                        .getCheckpointAddr());
//            }
//
//        }
//
//        return true;
//
//    }

    /**
     * Method verifies that the commit time strictly advances on the local
     * store by checking against the current root block.
     *
     * @param commitTime
     *            The proposed commit time.
     *
     * @throws IllegalArgumentException
     *             if the <i>commitTime</i> is LTE the value reported by
     *             {@link IRootBlockView#getLastCommitTime()}.
     */
    protected void assertCommitTimeAdvances(final long commitTime) {

        if (commitTime <= _rootBlock.getLastCommitTime()) {

            /*
             * The commit times must strictly advance.
             */
            throw new IllegalArgumentException();

        }

    }

    /**
     * Method verifies that the commit time strictly advances by comparing the
     * given commit times.
     *
     * @param currentCommitTime
     *            The commit time for the current commit.
     * @param priorCommitTime
     *            The commit time for the prior commit.
     *
     * @throws RuntimeException
     *             if the <i>currentCommitTime</i> is LTE the
     *             <i>priorCommitTime</i>.
*/ static protected void assertPriorCommitTimeAdvances( final long currentCommitTime, final long priorCommitTime) { if (currentCommitTime <= priorCommitTime) { throw new RuntimeException("Time goes backwards: commitTime=" + currentCommitTime + ", but lastCommitTime=" + priorCommitTime + " on the current root block"); } } @Override public void force(final boolean metadata) { assertOpen(); _bufferStrategy.force(metadata); } @Override public long size() { return _bufferStrategy.size(); } @Override public ByteBuffer read(final long addr) { assertOpen(); assertCanRead(); return _bufferStrategy.read(addr); } @Override public long write(final ByteBuffer data) { assertCanWrite(); return _bufferStrategy.write(data); } @Override public long write(final ByteBuffer data, final IAllocationContext context) { assertCanWrite(); if (_bufferStrategy instanceof IRWStrategy) { return ((IRWStrategy) _bufferStrategy).write(data, context); } else { return _bufferStrategy.write(data); } } @Override public IPSOutputStream getOutputStream() { assertCanWrite(); return _bufferStrategy.getOutputStream(); } @Override public IPSOutputStream getOutputStream(final IAllocationContext context) { assertCanWrite(); if (_bufferStrategy instanceof IRWStrategy) { return ((IRWStrategy) _bufferStrategy).getOutputStream(context); } else { return _bufferStrategy.getOutputStream(); } } @Override public InputStream getInputStream(long addr) { return _bufferStrategy.getInputStream(addr); } // Note: NOP for WORM. Used by RW for eventual recycle protocol. @Override public void delete(final long addr) { assertCanWrite(); _bufferStrategy.delete(addr); } @Override public void delete(final long addr, final IAllocationContext context) { assertCanWrite(); if(_bufferStrategy instanceof IRWStrategy) { ((IRWStrategy) _bufferStrategy).delete(addr, context); } else { _bufferStrategy.delete(addr); } } @Override public void detachContext(final IAllocationContext context) { assertCanWrite(); if(_bufferStrategy instanceof IRWStrategy) { ((IRWStrategy) _bufferStrategy).detachContext(context); } } @Override public void abortContext(final IAllocationContext context) { assertCanWrite(); if(_bufferStrategy instanceof IRWStrategy) { ((IRWStrategy) _bufferStrategy).abortContext(context); } } // @Override // public void registerContext(final IAllocationContext context) { // // assertCanWrite(); // // if(_bufferStrategy instanceof IRWStrategy) { // // ((IRWStrategy) _bufferStrategy).registerContext(context); // // } // // } @Override final public long getRootAddr(final int index) { final ReadLock lock = _fieldReadWriteLock.readLock(); lock.lock(); try { assertOpen(); final ICommitRecord commitRecord = _commitRecord; if (commitRecord == null) throw new AssertionError(); return commitRecord.getRootAddr(index); } finally { lock.unlock(); } } /** * Resolve the {@link ICommitRecord} for the earliest visible commit point * based on the caller's <i>releaseTime</i>. * <p> * Note: This method is used for HA. The caller provides a releaseTime based * on the readsOnCommitTime of the earliestActiveTx and the minReleaseAge * rather than {@link ITransactionService#getReleaseTime()} since the latter * is only updated by the release time consensus protocol during a 2-phase * commit. 
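     *
     * @param releaseTime
     *            The caller's effective release time.
     *
     * @return The {@link ICommitRecord} for the earliest visible commit point
     *         -or- <code>null</code> iff there are no commit points on the
     *         journal yet.
     */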
*/ protected ICommitRecord getEarliestVisibleCommitRecordForHA( final long releaseTime) { final ReadLock lock = _fieldReadWriteLock.readLock(); lock.lock(); try { final long commitCounter = _rootBlock.getCommitCounter(); final long lastCommitTime = _rootBlock.getLastCommitTime(); if (commitCounter == 0L) { if (log.isTraceEnabled()) log.trace("No commit points"); // Nothing committed yet. return null; } if (releaseTime >= lastCommitTime) { /* * The caller is querying with an effective releaseTime GTE the * lastCommitTime. It is not valid to have a releaseTime GTE the * current committed state. */ throw new IllegalArgumentException("releaseTime(" + releaseTime + ") >= lastCommitTime(" + lastCommitTime + ")"); } final CommitRecordIndex commitRecordIndex = _commitRecordIndex; if (commitRecordIndex == null) throw new AssertionError(); /* * Note: The commitRecordIndex does not allow us to probe with a * commitTime of ZERO. Therefore, when the releaseTime is ZERO, we * probe with a commitTime of ONE. Since the commitTimes are * timestamps, there will never be a record with a commitTime of ONE * and this will return us the first record in the * CommitRecordIndex. */ final long effectiveTimestamp = releaseTime == 0L ? 1 : releaseTime; final ICommitRecord commitRecord = commitRecordIndex .findNext(effectiveTimestamp); if (commitRecord == null) throw new AssertionError("commitCounter=" + commitCounter + " but no commitRecord for releaseTime=" + releaseTime + ", effectiveTimestamp=" + effectiveTimestamp + " :: " + commitRecordIndex); if (log.isTraceEnabled()) log.trace("releaseTime=" + releaseTime + ",commitRecord=" + commitRecord); return commitRecord; // } catch (IOException e) { // // // Note: Should not be thrown. Local method call. // throw new RuntimeException(e); } finally { lock.unlock(); } } /** * Returns a read-only view of the most recently committed * {@link ICommitRecord} containing the root addresses. * * @return The current {@link ICommitRecord} and never <code>null</code>. */ public ICommitRecord getCommitRecord() { final ReadLock lock = _fieldReadWriteLock.readLock(); lock.lock(); try { assertOpen(); final ICommitRecord commitRecord = _commitRecord; if (commitRecord == null) throw new AssertionError(); return commitRecord; } finally { lock.unlock(); } } /** * Return the commit record, either new or read from the root block. */ private ICommitRecord _getCommitRecord() { assert _fieldReadWriteLock.writeLock().isHeldByCurrentThread(); // the address of the current commit record from the root block. final long commitRecordAddr = _rootBlock.getCommitRecordAddr(); if (log.isInfoEnabled()) log.info("Reading commit record from: " + commitRecordAddr); if (commitRecordAddr == NULL) { // No commit record on the store yet. return new CommitRecord(); } else { // Read the commit record from the store. return CommitRecordSerializer.INSTANCE.deserialize(_bufferStrategy .read(commitRecordAddr)); } } /** * This method is invoked to mark any persistence capable data structures * as invalid (in an error state). This ensures that dirty committers are * not accidentally flushed through after a call to abort(). 
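     * <p>
     * The post-condition can be sketched as follows (illustrative only; the
     * committer reference is hypothetical): once invalidated, a committer must
     * refuse further commits.
     *
     * <pre>
     * final ICommitter c = ...; // some registered committer.
     * c.invalidate(new StackInfoReport("ABORT"));
     * c.handleCommit(commitTime); // must now fail, e.g., IndexInconsistentError.
     * </pre>
     *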
     * @see https://jira.blazegraph.com/browse/BLZG-1953
     */
    protected void invalidateCommitters() {

        if (log.isDebugEnabled())
            log.debug("invalidating committers for: " + this
                    + ", lastCommitTime: " + this.getLastCommitTime());

        assert _fieldReadWriteLock.writeLock().isHeldByCurrentThread();

        final Throwable t = new StackInfoReport("ABORT journal " + this
                + ", lastCommitTime: " + this.getLastCommitTime());

        for (ICommitter committer : _committers) {

            if (committer != null)
                committer.invalidate(t);

        }

        // @see BLZG-2023, BLZG-2041. Discard unisolated views from the resource locator cache.
        getResourceLocator().clearUnisolatedCache();

    }

    /**
     * This method is invoked by {@link #abort()} when the store must discard
     * any hard references that it may be holding to objects registered as
     * {@link ICommitter}s.
     * <p>
     * The default implementation discards the btree mapping names to named
     * btrees.
     * <p>
     * Subclasses MAY extend this method to discard their own committers but
     * MUST NOT override it completely.
     */
    protected void discardCommitters() {

        assert _fieldReadWriteLock.writeLock().isHeldByCurrentThread();

        // discard.
        _name2Addr = null;

    }

    /**
     * Invoked when a journal is first created, re-opened, or when the
     * committers have been {@link #discardCommitters() discarded}.
     * <p>
     * The basic implementation sets up the btree that is responsible for
     * resolving named btrees.
     * <p>
     * Subclasses may extend this method to setup their own committers.
     */
    protected void setupCommitters() {

        assert _fieldReadWriteLock.writeLock().isHeldByCurrentThread();

        if (!isReadOnly()) {

            /*
             * Only the leader can accept writes so only the leader will
             * register the Name2Addr object. Followers can access the
             * historical Name2Addr objects from the CommitRecordIndex for
             * historical commit points, but not the live Name2Addr object,
             * which is only on the leader.
             */
            setupName2AddrBTree(getRootAddr(ROOT_NAME2ADDR));

            /**
             * Do not register a committer to write the previous root block;
             * instead just create it and call it explicitly when required. This
             * is a workaround to allow "void" transactions.
             */
            m_rootBlockCommitter = new RootBlockCommitter(this);

            /**
             * If the strategy is a RWStrategy, then register the delete
             * block committer to store the deferred deletes for each
             * commit record.
             */
            if (_bufferStrategy instanceof IRWStrategy)
                setCommitter(DELETEBLOCK, new DeleteBlockCommitter(
                        (IRWStrategy) _bufferStrategy));

            /*
             * Responsible for writing the ICUVersionRecord exactly once onto
             * the backing store, e.g., when the store is created or when it is
             * opened with the "update" option specified for ICU.
             */
            setCommitter(ROOT_ICUVERSION, new ICUVersionCommitter());

        }

    }

    /**
     * Return the {@link ICUVersionRecord} from the current
     * {@link ICommitRecord} -or- a new instance for the current runtime
     * environment if the root address for {@link #ROOT_ICUVERSION} is
     * {@link #NULL}.
     */
    private ICUVersionRecord _getICUVersionRecord() {

        assert _fieldReadWriteLock.writeLock().isHeldByCurrentThread();

        final long addr = getRootAddr(ROOT_ICUVERSION);

        final ICUVersionRecord r;
        if (addr == NULL) {
            // New instance for the current runtime environment.
            r = ICUVersionRecord.newInstance();
        } else {
            // Existing instance from the store.
            r = (ICUVersionRecord) SerializerUtil.deserialize(read(addr));
        }

        return r;

    }

    /**
     * Writes the {@link ICUVersionRecord} onto the store iff either (a) it does
     * not exist; or (b) it exists, it differs from the last persistent record,
     * and the update flag was specified.
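     * <p>
     * For example (a sketch), the update behavior is enabled via the journal
     * properties before the journal is opened:
     *
     * <pre>
     * properties.setProperty(Options.UPDATE_ICU_VERSION, "true");
     * </pre>
     *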
     * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan
     *         Thompson</a>
     *
     * @see Options#UPDATE_ICU_VERSION
     */
    private class ICUVersionCommitter implements ICommitter {

        private boolean update;

        private long lastAddr;

        private volatile Throwable error = null;

        private ICUVersionCommitter() {

            // the "update" option.
            update = Boolean.valueOf(properties.getProperty(
                    Options.UPDATE_ICU_VERSION, "false"));

            // lookup the address of the ICU version record (may be NULL).
            lastAddr = getRootAddr(ROOT_ICUVERSION);

        }

        /**
         * Commits a new {@link ICUVersionRecord} IF none is defined -OR- IF one
         * is defined, it is a different version of ICU, and the update flag is
         * set.
         */
        @Override
        public long handleCommit(final long commitTime) {

            if (error != null)
                throw new IndexInconsistentError(error);

            if (!update && lastAddr != NULL) {

                // Nothing changed.
                return lastAddr;

            }

            /*
             * Note: The Journal only validates the persistent ICU version
             * record in its constructor. By the time the code reaches this
             * point, it is either in agreement or will be written.
             */

            final ICUVersionRecord r = ICUVersionRecord.newInstance();

            if (lastAddr == NULL || !(r.equals(_icuVersionRecord) && update)) {

                if (_icuVersionRecord != null && update)
                    log.warn("Updating ICUVersion: old=" + _icuVersionRecord
                            + ", new=" + r);

                // do not update next time.
                update = false;

                // write ICU version record onto the store.
                lastAddr = write(ByteBuffer.wrap(SerializerUtil.serialize(r)));

                // return address of the ICU version record.
                return lastAddr;

            }

            // Nothing changed.
            return lastAddr;

        }

        @Override
        public void invalidate(final Throwable t) {

            if (t == null)
                throw new IllegalArgumentException();

            if (error == null)
                error = t;

        }

    }

    /*
     * named indices.
     */

    /**
     * Setup the btree that resolves named indices. This is invoked when the
     * journal is opened and by {@link #abort()}.
     *
     * @param addr
     *            The root address of the btree -or- 0L iff the btree has not
     *            been defined yet.
     *
     * @see Options#LIVE_INDEX_CACHE_CAPACITY
     */
    Name2Addr setupName2AddrBTree(final long addr) {

        assert _fieldReadWriteLock.writeLock().isHeldByCurrentThread();

        assert _name2Addr == null;

        if (addr == 0L) {

            /*
             * Create btree mapping names to addresses.
             *
             * The btree has either never been created or if it had been created
             * then the store was never committed and the btree had since been
             * discarded. In any case we create a new btree now.
             *
             * Note: if the journal is read-only then we create the Name2Addr
             * index on an in-memory store in order to avoid triggering a write
             * exception on the journal.
             */

            if (log.isInfoEnabled())
                log.info("New " + Name2Addr.class.getName());

            _name2Addr = Name2Addr.create((isReadOnly() ? new SimpleMemoryRawStore()
                    : this));

        } else {

            /*
             * Reload the mutable btree from its checkpoint address.
             *
             * Note: This is the live view of the B+Tree. In this specific case
             * we DO NOT use the canonicalizing mapping since we do not want
             * anyone else to have access to this same instance of the B+Tree.
             */

            if (log.isInfoEnabled())
                log.info("Loading " + Name2Addr.class.getName() + " from "
                        + addr);

            _name2Addr = (Name2Addr) BTree.load(this, addr, false/* readOnly */);

        }

        _name2Addr.setupCache(liveIndexCacheCapacity, liveIndexCacheTimeout);

        // register for commit notices.
        setCommitter(ROOT_NAME2ADDR, _name2Addr);

        return _name2Addr;

    }

    /**
     * Return a read-only view of the last committed state of the
     * {@link CommitRecordIndex}.
     *
     * @return The read-only view of the {@link CommitRecordIndex}.
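     *
     * <p>
     * Illustrative use (a sketch): resolve the commit record for a given
     * commit time from the read-only view.
     *
     * <pre>
     * final CommitRecordIndex ndx = getReadOnlyCommitRecordIndex();
     * final ICommitRecord rec = ndx.find(commitTime); // null if none.
     * </pre>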
     */
    public CommitRecordIndex getReadOnlyCommitRecordIndex() {

        final ReadLock lock = _fieldReadWriteLock.readLock();

        lock.lock();

        try {

            assertOpen();

            final CommitRecordIndex commitRecordIndex = getCommitRecordIndex(
                    _rootBlock.getCommitRecordIndexAddr(), true/* readOnly */);

//            return new ReadOnlyIndex(commitRecordIndex);
            return commitRecordIndex;

        } finally {

            lock.unlock();

        }

    }

//    /**
//     * I have removed this method since the returned {@link CommitRecordIndex}
//     * was being used without appropriate synchronization. There is a
//     * {@link #getReadOnlyCommitRecordIndex()} which may be used in place of
//     * this method.
//     */
//    protected CommitRecordIndex getCommitRecordIndex() {
//
//        final ReadLock lock = _fieldReadWriteLock.readLock();
//
//        lock.lock();
//
//        try {
//
//            assertOpen();
//
//            final long commitRecordIndexAddr = _rootBlock.getCommitRecordIndexAddr();
//
//            final CommitRecordIndex commitRecordIndex = getCommitRecordIndex(addr);
//
//            if (commitRecordIndex == null)
//                throw new AssertionError();
//
//            return commitRecordIndex;
//
//        } finally {
//
//            lock.unlock();
//
//        }
//
//    }

    /**
     * Read and return the {@link CommitRecordIndex} from the current root
     * block.
     *
     * @return The {@link CommitRecordIndex} and never <code>null</code>.
     */
    private CommitRecordIndex _getCommitRecordIndex() {

        assert _fieldReadWriteLock.writeLock().isHeldByCurrentThread();

        assert _rootBlock != null;

        final long addr = _rootBlock.getCommitRecordIndexAddr();

        try {

            if (log.isDebugEnabled())
                log.debug("Loading from addr=" + addr);

            // Load the live index from the disk.
            return getCommitRecordIndex(addr, false/* readOnly */);

        } catch (RuntimeException ex) {

            /*
             * Log the root block for post-mortem.
             */
            log.fatal("Could not read the commit record index:\n" + _rootBlock,
                    ex);

            throw ex;

        }

    }

    /**
     * Create or load and return the index that resolves timestamps to
     * {@link ICommitRecord}s. This method is capable of returning either the
     * live {@link CommitRecordIndex} or a read-only view of any committed
     * version of that index.
     *
     * <strong>CAUTION: DO NOT EXPOSE THE LIVE COMMIT RECORD INDEX OUTSIDE OF
     * THIS CLASS. IT IS NOT POSSIBLE TO HAVE CORRECT SYNCHRONIZATION ON THAT
     * INDEX IN ANOTHER CLASS.</strong>
     *
     * @param addr
     *            The root address of the index -or- 0L if the index has not
     *            been created yet. When addr is non-{@link #NULL}, each
     *            invocation will return a distinct {@link CommitRecordIndex}
     *            object.
     *
     * @param readOnly
     *            When <code>false</code> the returned index is NOT cached.
     *
     * @return The {@link CommitRecordIndex} for that address or a new index if
     *         <code>0L</code> was specified as the address.
     *
     * @see #_commitRecordIndex
     */
    protected CommitRecordIndex getCommitRecordIndex(final long addr,
            final boolean readOnly) {

        if (log.isInfoEnabled())
            log.info("addr=" + toString(addr));

        final CommitRecordIndex ndx;

        if (addr == NULL) {

            /*
             * The btree has either never been created or if it had been created
             * then the store was never committed and the btree had since been
             * discarded. In any case we create a new btree now.
             *
             * Note: if the journal is read-only then we create the commit
             * record index on an in-memory store in order to avoid triggering a
             * write exception on the journal.
             *
             * Note: if the journal is not the quorum leader then it is
             * effectively read-only.
             */
            if (isReadOnly() || readOnly) {

                ndx = CommitRecordIndex.createTransient();

            } else {

                ndx = CommitRecordIndex.create(this);

            }

        } else {

            if (readOnly) {

                /*
                 * Read-only view of the CommitRecordIndex having that
                 * checkpointAddr.
                 */
                ndx = (CommitRecordIndex) getIndexWithCheckpointAddr(addr);

            } else {

                /*
                 * Reload the mutable btree from its root address.
                 *
                 * Note: For this code path we DO NOT cache the index view.
                 */
                ndx = (CommitRecordIndex) BTree.load(this, addr, false/* readOnly */);

            }

        }

        assert ndx != null;

        return ndx;

    }

    /**
     * Note: There are some bigdata releases (such as 1.0.4) where the commit
     * record index was not pruned when deferred deletes were recycled. By
     * maintaining this test, we will correctly refuse to return a commit record
     * for a commit point whose deferred deletes have been recycled, even when
     * the commit record is still present in the commit record index.
     *
     * @see https://sourceforge.net/apps/trac/bigdata/ticket/480
     */
    private boolean isHistoryGone(final long commitTime) {

        if (this._bufferStrategy instanceof IHistoryManager) {

            final long lastReleaseTime = ((IHistoryManager) _bufferStrategy)
                    .getLastReleaseTime();

            if (commitTime <= lastReleaseTime) {

                if (log.isDebugEnabled())
                    log.debug("History gone: commitTime=" + commitTime);

                return true; // no index available

            }

        }

        return false;

    }

    /**
     * {@inheritDoc}
     *
     * @todo the {@link CommitRecordIndex} is a possible source of thread
     *       contention since transactions need to use this code path in order
     *       to locate named indices but the {@link WriteExecutorService} can
     *       also write on this index. I have tried some different approaches to
     *       handling this.
     */
    @Override
    public ICommitRecord getCommitRecord(final long commitTime) {

        if (isHistoryGone(commitTime))
            return null;

        final ReadLock lock = _fieldReadWriteLock.readLock();

        lock.lock();

        try {

            assertOpen();

            final CommitRecordIndex commitRecordIndex = _commitRecordIndex;

            if (commitRecordIndex == null)
                throw new AssertionError();

            return commitRecordIndex.find(commitTime);

        } finally {

            lock.unlock();

        }

    }

    /**
     * Return the first commit record whose timestamp is strictly greater than
     * the given commitTime.
     *
     * @param commitTime
     *            The commit time.
     *
     * @return The commit record -or- <code>null</code> if there is no commit
     *         record whose timestamp is strictly greater than
     *         <i>commitTime</i>.
     */
    public ICommitRecord getCommitRecordStrictlyGreaterThan(final long commitTime) {

        final ReadLock lock = _fieldReadWriteLock.readLock();

        lock.lock();

        try {

            assertOpen();

            final CommitRecordIndex commitRecordIndex = _commitRecordIndex;

            if (commitRecordIndex == null)
                throw new AssertionError();

            return commitRecordIndex.findNext(commitTime);

        } finally {

            lock.unlock();

        }

    }

    /**
     * {@inheritDoc}
     * <p>
     * Note: Transactions should pass in the timestamp against which they are
     * reading rather than the transaction identifier (aka startTime). By
     * providing the timestamp of the commit point, the transaction will hit the
     * {@link #indexCache}. If the transaction passes the startTime instead,
     * then all startTimes will be different and the cache will be defeated.
     *
     * @throws UnsupportedOperationException
     *             If you pass in {@link ITx#UNISOLATED},
     *             {@link ITx#READ_COMMITTED}, or a timestamp that corresponds
     *             to a read-write transaction since those are not "commit
     *             times".
     *
     * @see #indexCache
     * @see <a href="http://sourceforge.net/apps/trac/bigdata/ticket/546" > Add
     *      cache for access to historical index views on the Journal by name
     *      and commitTime. </a>
     *
     *      FIXME GIST Reconcile API tension with {@link IIndex} and
     *      {@link ICheckpointProtocol}, however this method is overridden by
     *      {@link Journal} and is also implemented by
     *      {@link IBigdataFederation}. The central remaining tensions are
     *      {@link FusedView} and the local/remote aspect.
{@link FusedView} * could probably be "fixed" by extending {@link AbstractBTree} rather * than having an inner delegate for the mutable view. The local/remote * issue is more complex. */ @Override public IIndex getIndex(final String name, final long commitTime) { return (BTree) getIndexLocal(name, commitTime); } /** * Core implementation for access to historical index views. * <p> * Note: Transactions should pass in the timestamp against which they are * reading rather than the transaction identifier (aka startTime). By * providing the timestamp of the commit point, the transaction will hit the * {@link #indexCache}. If the transaction passes the startTime instead, * then all startTimes will be different and the cache will be defeated. * * @throws UnsupportedOperationException * If you pass in {@link ITx#UNISOLATED}, * {@link ITx#READ_COMMITTED}, or a timestamp that corresponds * to a read-write transaction since those are not "commit * times". * * @see #indexCache * @see #getIndex(String, long) * @see <a href="http://sourceforge.net/apps/trac/bigdata/ticket/546" > Add * cache for access to historical index views on the Journal by name * and commitTime. </a> */ @Override final public ICheckpointProtocol getIndexLocal(final String name, final long commitTime) { if (commitTime == ITx.UNISOLATED || commitTime == ITx.READ_COMMITTED || TimestampUtility.isReadWriteTx(commitTime)) { throw new UnsupportedOperationException("name=" + name + ",commitTime=" + TimestampUtility.toString(commitTime)); } ICheckpointProtocol ndx = null; /* * Form the key for the cache. * * Note: In order to avoid cluttering the cache, a read-only or * read/write transaction MUST pass the timestamp of the actual commit * point against which it is reading into this method. If it passes in * abs(txId) instead, then the cache will be cluttered since each tx * will have a distinct key for the same index against the same commit * point. */ final NT nt = new NT(name, commitTime); // Test the cache. ndx = indexCache.get(nt); if (ndx != null) { if (isHistoryGone(commitTime)) { if (log.isTraceEnabled()) log.trace("Removing entry from cache: " + name); /* * No longer visible. * * Note: If you are using a transaction, then the transaction * will have a read lock which prevents the commit point against * which it is reading from being released. Thus, a transaction * can not hit this code path. However, it can be hit by * historical reads which are not protected by a transaction. */ indexCache.remove(nt); return null; } // Cache hit. return ndx; } /* * Cache miss. */ final ReadLock lock = _fieldReadWriteLock.readLock(); lock.lock(); try { assertOpen(); // Resolve the commit record. final ICommitRecord commitRecord = getCommitRecord(commitTime); if (commitRecord == null) { // if (log.isInfoEnabled()) log.warn("No commit record: name=" + name + ", timestamp=" + commitTime); return null; } // Resolve the index against that commit record. ndx = (ICheckpointProtocol) getIndexWithCommitRecord(name, commitRecord); if (ndx == null) { // Not found return null; } // Add the index to the cache. final ICheckpointProtocol ndx2 = indexCache.putIfAbsent(nt, ndx); if (ndx2 != null) { /* * Lost a data race. Use the winner's version of the index. * * Note: Both index objects SHOULD be the same reference. * getIndex(name,commitRecord) will go through a canonicalizing * mapping to ensure that. */ ndx = ndx2; } // Found it and cached it. return ndx; } finally { lock.unlock(); } } /** * The size of the cache from (name,timestamp) to {@link IIndex}. 
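     * <p>
     * The cache is keyed by (name, commitTime). For example (a sketch; the
     * index name is hypothetical), a reader that passes the commitTime of the
     * commit point, rather than a tx start time, will hit this cache:
     *
     * <pre>
     * final ICheckpointProtocol ndx = getIndexLocal("myIndex", commitTime);
     * </pre>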
*/ protected int getIndexCacheSize() { return indexCache.size(); } /** * The size of the canonicalizing cache from addr to {@link IIndex}. */ protected int getHistoricalIndexCacheSize() { return historicalIndexCache.size(); } // /** // * Returns a read-only named index loaded from a {@link ICommitRecord}. The // * {@link BTree} will be marked as read-only, it will NOT permit writes, and // * {@link BTree#getLastCommitTime(long)} will report the value associated // * with {@link Entry#commitTime} for the historical {@link Name2Addr} // * instance for that {@link ICommitRecord}. // * // * @return The named index -or- <code>null</code> iff the named index did // * not exist as of that commit record. // * // * @deprecated by {@link #getIndexWithCommitRecord(String, ICommitRecord)} // */ // public BTree getIndex(final String name, final ICommitRecord commitRecord) { // // return (BTree) getIndexWithCommitRecord(name, commitRecord); // // } /** * Returns a read-only named index loaded from a {@link ICommitRecord}. The * index will be marked as read-only, it will NOT permit writes, and * {@link ICheckpointProtocol#getLastCommitTime(long)} will report the value * associated with {@link Entry#commitTime} for the historical * {@link Name2Addr} instance for that {@link ICommitRecord}. * <p> * Note: This method should be preferred to * {@link #getIndexWithCheckpointAddr(long)} for read-historical indices * since it will explicitly mark the index as read-only and specifies the * <i>lastCommitTime</i> on the returned index based on * {@link Name2Addr.Entry#commitTime}, which is the actual commit time for * the last update to the index. * * @return The named index -or- <code>null</code> iff the named index did * not exist as of that commit record. */ final public ICheckpointProtocol getIndexWithCommitRecord( final String name, final ICommitRecord commitRecord) { if (name == null) throw new IllegalArgumentException(); if (commitRecord == null) throw new IllegalArgumentException(); final ReadLock lock = _fieldReadWriteLock.readLock(); lock.lock(); try { assertOpen(); /* * The address of an historical Name2Addr mapping used to resolve * named indices for the historical state associated with this * commit record. */ final long checkpointAddr = commitRecord .getRootAddr(ROOT_NAME2ADDR); if (checkpointAddr == 0L) { log.warn("No name2addr entry in this commit record: " + commitRecord); return null; } /* * Resolve the address of the historical Name2Addr object using the * canonicalizing object cache. This prevents multiple historical * Name2Addr objects springing into existence for the same commit * record. */ final Name2Addr name2Addr = (Name2Addr) getIndexWithCheckpointAddr(checkpointAddr); /* * The address at which the named index was written for that * historical state. */ final Name2Addr.Entry entry = name2Addr.getEntry(name); if (entry == null) { // No such index by name for that historical state. return null; } /* * Resolve the named index using the object cache to impose a * canonicalizing mapping on the historical named indices based on * the address on which it was written in the store. */ final ICheckpointProtocol index = getIndexWithCheckpointAddr(entry.checkpointAddr); assert entry.commitTime != 0L : "Entry=" + entry; // Set the last commit time on the btree. index.setLastCommitTime(entry.commitTime); return index; } finally { lock.unlock(); } } // /** // * A canonicalizing mapping for <em>historical</em> {@link BTree}s. 
//     * <p>
//     * Note: This method imposes a canonicalizing mapping and ensures that there
//     * will be at most one instance of the historical index at a time. This
//     * guarantee is used to facilitate buffer management. Writes on indices
//     * returned by this method are NOT allowed.
//     * <p>
//     * Note: This method marks the {@link BTree} as read-only but does not set
//     * {@link BTree#setLastCommitTime(long)} since it does not have access to
//     * the {@link Entry#commitTime}, only the {@link BTree}'s checkpointAddr and
//     * {@link Checkpoint} record. See {@link #getIndex(String, ICommitRecord)}
//     * which does set {@link BTree#setLastCommitTime(long)}.
//     * <p>
//     * Note: The canonicalizing mapping for unisolated {@link BTree}s is
//     * maintained by the {@link ITx#UNISOLATED} {@link Name2Addr} instance.
//     *
//     * @param checkpointAddr
//     *            The address of the {@link Checkpoint} record for the
//     *            {@link BTree}.
//     *
//     * @return The {@link BTree} loaded from that {@link Checkpoint}.
//     *
//     * @see Options#HISTORICAL_INDEX_CACHE_CAPACITY
//     *
//     * @deprecated by {@link #getIndexWithCheckpointAddr(long)}
//     */
//    final public BTree getIndex(final long checkpointAddr) {
//
//        return (BTree) getIndexWithCheckpointAddr(checkpointAddr);
//
//    }

//    /**
//     * A canonicalizing mapping for <em>historical</em> {@link HTree}s.
//     * <p>
//     * Note: This method imposes a canonicalizing mapping and ensures that there
//     * will be at most one instance of the historical index at a time. This
//     * guarantee is used to facilitate buffer management. Writes on indices
//     * returned by this method are NOT allowed.
//     * <p>
//     * Note: This method marks the {@link BTree} as read-only but does not set
//     * {@link BTree#setLastCommitTime(long)} since it does not have access to
//     * the {@link Entry#commitTime}, only the {@link BTree}'s checkpointAddr and
//     * {@link Checkpoint} record. See {@link #getIndex(String, ICommitRecord)}
//     * which does set {@link BTree#setLastCommitTime(long)}.
//     * <p>
//     * Note: The canonicalizing mapping for unisolated {@link BTree}s is
//     * maintained by the {@link ITx#UNISOLATED} {@link Name2Addr} instance.
//     *
//     * @param checkpointAddr
//     *            The address of the {@link Checkpoint} record for the
//     *            {@link HTree}.
//     *
//     * @return The {@link HTree} loaded from that {@link Checkpoint}.
//     *
//     * @see Options#HISTORICAL_INDEX_CACHE_CAPACITY
//     */
//    final public HTree getHTree(final long checkpointAddr) {
//
//        return (HTree) getIndexWithCheckpointAddr(checkpointAddr);
//
//    }

    /**
     * A canonicalizing mapping for <em>historical</em> (read-only) views of
     * persistence capable data structures (core impl).
     * <p>
     * Note: This method imposes a canonicalizing mapping and ensures that there
     * will be at most one object providing a view of the historical data
     * structure as of the specified timestamp. This guarantee is used to
     * facilitate buffer management.
     * <p>
     * Note: The canonicalizing mapping for unisolated views of persistence
     * capable data structures is maintained by the {@link ITx#UNISOLATED}
     * {@link Name2Addr} instance.
     *
     * @param checkpointAddr
     *            The address of the {@link Checkpoint} record.
     *
     * @return The read-only persistence capable data structure associated with
     *         that {@link Checkpoint}.
     *
     * @see Options#HISTORICAL_INDEX_CACHE_CAPACITY
     */
    public final ICheckpointProtocol getIndexWithCheckpointAddr(
            final long checkpointAddr) {

        /*
         * Note: There are potentially three IO operations here.
         * Reading the Checkpoint record, reading the IndexMetadata record, then
         * reading the root node/leaf of the BTree.
         *
         * Note: We use putIfAbsent() here rather than the [synchronized]
         * keyword for higher concurrency with atomic semantics.
         *
         * DO NOT use the checkpointAddr but rather the physical address
         * without the length; this will enable the RWStore to clear the
         * cache efficiently without mocking up an address (which requires
         * access to the checkpointAddr size).
         */
        final long offset = getPhysicalAddress(checkpointAddr);

        ICommitter ndx = historicalIndexCache.get(offset);

        if (ndx == null) {

            /*
             * Load index from the store.
             *
             * Note: Does not set lastCommitTime.
             */
            ndx = Checkpoint
                    .loadFromCheckpoint(this, checkpointAddr, true/* readOnly */);

            if (log.isTraceEnabled())
                log.trace("Adding checkpoint to historical index at "
                        + checkpointAddr);

        } else {

            if (log.isTraceEnabled())
                log.trace("Found historical index at " + checkpointAddr
                        + ", historicalIndexCache.size(): "
                        + historicalIndexCache.size());

        }

        // Note: putIfAbsent is used to make concurrent requests atomic.
        ICommitter oldval = historicalIndexCache.putIfAbsent(offset, ndx);

        if (oldval != null) {

            /*
             * If someone beat us to it then use the BTree instance that they
             * loaded.
             */
            ndx = oldval;

        }

        return (ICheckpointProtocol) ndx;

    }

    /**
     * Registers a named index. Once registered the index will participate in
     * atomic commits.
     * <p>
     * Note: A named index must be registered outside of any transaction before
     * it may be used inside of a transaction.
     * <p>
     * Note: You MUST {@link #commit()} before the registered index will be
     * either restart-safe or visible to new transactions.
     */
    @Override
    final public void registerIndex(final IndexMetadata metadata) {

        if (metadata == null)
            throw new IllegalArgumentException();

        final String name = metadata.getName();

        if (name == null)
            throw new IllegalArgumentException();

        // Note: Old code path was B+Tree specific.
//        registerIndex(name, metadata);

        validateIndexMetadata(name, metadata);

        // Note: Generic index create code path.
        final ICheckpointProtocol ndx = Checkpoint.create(this, metadata);

        // Note: Generic index registration code path.
        _register(name, ndx);

    }

    /**
     * Provides an opportunity to validate some aspects of the
     * {@link IndexMetadata} for an index partition.
     */
    protected void validateIndexMetadata(final String name,
            final IndexMetadata metadata) {

        // NOP, but extended by the ManagedJournal.

    }

    /**
     * {@inheritDoc}
     * <p>
     * Once registered the index will participate in atomic commits.
     * <p>
     * Note: A named index must be registered outside of any transaction before
     * it may be used inside of a transaction.
     * <p>
     * Note: You MUST {@link #commit()} before the registered index will be
     * either restart-safe or visible to new transactions.
     *
     * @deprecated by {@link #register(String, IndexMetadata)}
     */
    @Override
    final public BTree registerIndex(final String name, final IndexMetadata metadata) {

        validateIndexMetadata(name, metadata);

        final BTree btree = BTree.create(this, metadata);

        return registerIndex(name, btree);

    }

    /**
     * Variant method creates and registers a named persistence capable data
     * structure but does not assume that the data structure will be a
     * {@link BTree}.
     *
     * @param name
     *            The name of the index.
     * @param metadata
     *            The metadata that describes the data structure to be created.
     *
     * @return The persistence capable data structure.
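     *
     * <p>
     * Example (a sketch; the index name is hypothetical):
     *
     * <pre>
     * final IndexMetadata md = new IndexMetadata("myIndex", UUID.randomUUID());
     * final ICheckpointProtocol ndx = register("myIndex", md);
     * commit(); // the index is not restart-safe until the commit.
     * </pre>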
     *
     * @see Checkpoint#create(IRawStore, IndexMetadata)
     */
    @Override
    public ICheckpointProtocol register(final String name,
            final IndexMetadata metadata) {

        final ICheckpointProtocol ndx = Checkpoint.create(this, metadata);

        _register(name, ndx);

        return ndx;

    }

    @Override
    final public BTree registerIndex(final String name, final BTree ndx) {

        _register(name, ndx);

        return ndx;

    }

    final public void registerIndex(final String name, final HTree ndx) {

        _register(name, ndx);

    }

    /**
     * Registers a named index (core impl). Once registered the index will
     * participate in atomic commits.
     * <p>
     * Note: A named index must be registered outside of any transaction before
     * it may be used inside of a transaction.
     * <p>
     * Note: You MUST {@link #commit()} before the registered index will be
     * either restart-safe or visible to new transactions.
     *
     * @param name
     *            The name.
     * @param ndx
     *            The persistence capable data structure.
     */
    final private void _register(final String name, final ICheckpointProtocol ndx) {

        final ReadLock lock = _fieldReadWriteLock.readLock();

        lock.lock();

        try {

            assertOpen();

            synchronized (_name2Addr) {

                // add to the persistent name map.
                _name2Addr.registerIndex(name, ndx);

            }

        } finally {

            lock.unlock();

        }

    }

    /**
     * {@inheritDoc}
     * <p>
     * Drops the named index. The index will no longer participate in atomic
     * commits and will not be visible to new transactions. Storage will be
     * reclaimed IFF the backing store supports that functionality.
     */
    @Override
    public void dropIndex(final String name) {

        final ICheckpointProtocol ndx = getUnisolatedIndex(name);

        if (ndx == null)
            throw new NoSuchIndexException(name);

//        final IndexTypeEnum indexType = ndx.getIndexMetadata().getIndexType();

//        if (getBufferStrategy() instanceof IRWStrategy)
        {

            /*
             * Reclaim storage associated with the index.
             */
            ndx.removeAll();

//            switch (indexType) {
//            case BTree:
//                ((AbstractBTree) ndx).removeAll();
//                break;
//            case HTree:
//                ((HTree) ndx).removeAll();
//                break;
//            default:
//                throw new AssertionError("Unknown: " + indexType);
//            }

        }

        final ReadLock lock = _fieldReadWriteLock.readLock();

        lock.lock();

        try {

            assertOpen();

            synchronized (_name2Addr) {

                // drop from the persistent name map.
                _name2Addr.dropIndex(name);

            }

        } finally {

            lock.unlock();

        }

    }

    @Override
    public Iterator<String> indexNameScan(final String prefix,
            final long timestamp) {

        if (timestamp == ITx.UNISOLATED) {

            /*
             * For the live Name2Addr index, we get the necessary locks to avoid
             * concurrent modifications, fully materialize the iterator into a
             * collection, and then return an iterator over that collection.
             * This is safe, but not as scalable.
             */

            final ReadLock lock = _fieldReadWriteLock.readLock();

            lock.lock();

            try {

                final List<String> names = new LinkedList<String>();

                synchronized (_name2Addr) {

                    final Iterator<String> itr = Name2Addr.indexNameScan(
                            prefix, _name2Addr);

                    while (itr.hasNext()) {

                        names.add(itr.next());

                    }

                }

                return names.iterator();

            } finally {

                lock.unlock();

            }

        }

        final IIndex n2a;

        if (timestamp == ITx.READ_COMMITTED) {

            n2a = getName2Addr();

        } else if (TimestampUtility.isReadWriteTx(timestamp)) {

            final ITx tx = getLocalTransactionManager().getTx(timestamp);

            if (tx == null)
                throw new TransactionNotFoundException(timestamp);

            final long readsOnCommitTime = tx.getReadsOnCommitTime();

            n2a = getName2Addr(readsOnCommitTime);

        } else {

            n2a = getName2Addr(timestamp);

        }

        return Name2Addr.indexNameScan(prefix, n2a);

    }

    /**
     * Return the mutable view of the named index (aka the "live" or
     * {@link ITx#UNISOLATED} index). This object is NOT thread-safe.
You MUST * NOT write on this index unless you KNOW that you are the only writer. See * {@link ConcurrencyManager}, which handles exclusive locks for * {@link ITx#UNISOLATED} indices. * * @return The mutable view of the index. * * @see #getLiveView(String, long) */ @Override final public BTree getIndex(final String name) { return (BTree) getUnisolatedIndex(name); } // /** // * Return the mutable view of the named index (aka the "live" or // * {@link ITx#UNISOLATED} index). This object is NOT thread-safe. You MUST // * NOT write on this index unless you KNOW that you are the only writer. See // * {@link ConcurrencyManager}, which handles exclusive locks for // * {@link ITx#UNISOLATED} indices. // * // * @return The mutable view of the index. // * // * @see #getUnisolatedIndex(String) // * // * @deprecated Use {@link #getUnisolatedIndex(String)} // */ // @Deprecated // final public HTree getHTree(final String name) { // // return (HTree) getUnisolatedIndex(name); // // } // /** // * Return the mutable view of the named index (aka the "live" or // * {@link ITx#UNISOLATED} index). This object is NOT thread-safe. You MUST // * NOT write on this index unless you KNOW that you are the only writer. See // * {@link ConcurrencyManager}, which handles exclusive locks for // * {@link ITx#UNISOLATED} indices. // * // * @return The mutable view of the index. // * // * @see #getLiveView(String, long) // */ // final public Stream getStream(final String name) { // // return (Stream) getUnisolatedIndex(name); // // } /** * Return the mutable view of the named persistence capable data structure * (aka the "live" or {@link ITx#UNISOLATED} view). * * @return The mutable view of the persistence capable data structure. */ @Override final public ICheckpointProtocol getUnisolatedIndex(final String name) { final ReadLock lock = _fieldReadWriteLock.readLock(); lock.lock(); try { assertOpen(); if (name == null) throw new IllegalArgumentException(); if (Thread.interrupted()) { throw new RuntimeException(new InterruptedException()); } // Note: NullPointerException can be thrown here if asynchronously // closed (should be fixed by the ReadLock). synchronized (_name2Addr) { return _name2Addr.getIndex(name); } } finally { lock.unlock(); } } /* * IAddressManager */ @Override final public long getOffset(long addr) { return _bufferStrategy.getOffset(addr); } @Override final public long getPhysicalAddress(long addr) { return _bufferStrategy.getAddressManager().getPhysicalAddress(addr); } @Override final public int getByteCount(long addr) { return _bufferStrategy.getByteCount(addr); } @Override final public long toAddr(int nbytes, long offset) { return _bufferStrategy.toAddr(nbytes, offset); } @Override final public String toString(long addr) { return _bufferStrategy.toString(addr); } final public int getOffsetBits() { return _bufferStrategy.getOffsetBits(); } /** * The maximum length of a record that may be written on the store. */ final public int getMaxRecordSize() { return _bufferStrategy.getMaxRecordSize(); } /* * High Availability */ /** * The current quorum token or {@value Quorum#NO_QUORUM} if the node is not * part of a {@link Quorum}. * <p> * The state of the field changes when a new quorum is negotiated or when an * existing quorum is broken. However, state changes in this field MUST be * coordinated with the journal in order to cover transitions into the * quorum (blocked read/write requests must be released) and transitions * when the quorum breaks (the current write set must be discarded by the * master). 
In addition, when there is no quorum, the resynchronization
     * protocol may affect both the persistent state of the journal and any
     * state which the journal keeps buffered in memory (record cache, address
     * translation cache, etc).
     * <p>
     * Access to this field is protected by the {@link #_fieldReadWriteLock} but
     * MUST also be coordinated as described above.
     */
    private volatile long quorumToken = Quorum.NO_QUORUM;

    protected long getQuorumToken() {

        return quorumToken;

    }

    /**
     * This method will update the quorum token on the journal and the
     * associated values <em>as if</em> the service was not joined with a met
     * quorum. This allows us to handle conditions where we know that the
     * service will not be joined with the met quorum once it is able to observe
     * the associated quorum events. However, those events can not be delivered
     * until the service is connected to zookeeper, and one of the common causes
     * for entering the error state for the HAJournalServer is that the zk
     * client connection has been closed, hence no zk events.
     *
     * @param newValue
     *            The new value.
     */
    protected void clearQuorumToken(final long newValue) {

        // Lie to it.
        final boolean isServiceJoined = false;

        setQuorumToken2(newValue, isServiceJoined);

    }

    protected void setQuorumToken(final long newValue) {

        // Protect for potential NPE
        if (quorum == null)
            return;

        // This quorum member.
        final QuorumService<HAGlue> localService = quorum.getClient();

        final boolean isServiceJoined = localService != null
                && localService.isJoinedMember(newValue);

        setQuorumToken2(newValue, isServiceJoined);

    }

    /**
     * Update the {@link #quorumToken}, {@link #haReadyToken}, and
     * {@link #hashCode()}.
     *
     * @param newValue
     *            The new quorum token value.
     * @param isServiceJoined
     *            <code>true</code> iff this service is known to be a service
     *            that is joined with the met quorum.
     */
    private void setQuorumToken2(final long newValue, final boolean isServiceJoined) {

        if (haLog.isInfoEnabled())
            haLog.info("current: " + quorumToken + ", new: " + newValue
                    + ", joined=" + isServiceJoined);

        // Protect for potential NPE
        if (quorum == null)
            return;

        // The HAQuorumService (if running).
        final QuorumService<HAGlue> localService;
        {
            QuorumService<HAGlue> t;
            try {
                t = quorum.getClient();
            } catch (IllegalStateException ex) {
                t = null;
            }
            localService = t;
        }

        // Figure out the state transitions involved.
        final QuorumTokenTransitions transitionState = new QuorumTokenTransitions(
                quorumToken, newValue, isServiceJoined, haReadyToken);

        if (haLog.isInfoEnabled())
            haLog.info(transitionState.toString());

        if (transitionState.didBreak) {

            /*
             * If the quorum broke then set the token immediately without
             * waiting for the lock.
             *
             * Note: This is a volatile write. We want the break to become
             * visible as soon as possible in order to fail things which examine
             * the token.
             *
             * TODO Why not clear the haReadyToken and haStatus here as well?
             * However, those changes are not going to be noticed by threads
             * awaiting a state change until we do signalAll() and that requires
             * the lock.
             *
             * TODO Why not clear the haReadyToken and haStatus on a leave using
             * a volatile write? Again, threads blocked in awaitHAReady() would
             * not notice until we actually take the lock and do signalAll().
             */
            this.quorumToken = Quorum.NO_QUORUM;

        }

        /*
         * Both a meet and a break require an exclusive write lock.
         *
         * TODO: Is this lock synchronization a problem? With token update
         * delayed on a lock, could a second thread process a new token based on
         * incorrect state since the first thread has not updated the token?
         * For example: NO_TOKEN -> valid token -> NO_TOKEN
         */
        final WriteLock lock = _fieldReadWriteLock.writeLock();

        lock.lock();

        try {

            /**
             * The following condition tests are slightly confusing; it is not
             * clear that they represent all real states.
             *
             * <pre>
             * Essentially:
             * didBreak - abort
             * didLeaveMetQuorum - abort
             * didJoinMetQuorum - follower gets rootBlocks
             * didMeet - just sets token
             * </pre>
             *
             * In addition, there is a case where a service is joined as
             * perceived by the ZKQuorum but not yet HAReady. If a 2-phase
             * commit is initiated, then the service will enter an error state
             * (because it is not yet HAReady). This net-zero change case is
             * explicitly handled below.
             */
            if (transitionState.didLeaveMetQuorum) {

                /*
                 * The service was joined with a met quorum.
                 */

                quorumToken = newValue; // volatile write.

                /*
                 * We also need to discard any active read/write tx since there
                 * is no longer a quorum. This will hit both read-only
                 * transactions running on any service (not necessarily the
                 * leader) and read/write transactions if this service was the
                 * old leader.
                 *
                 * Note: We do not need to discard read-only tx since the
                 * committed state should remain valid even when a quorum is
                 * lost. However, it would be a bit odd to leave read-only
                 * transactions running if you could not start a new read-only
                 * transaction because the quorum is not met.
                 */
                ((AbstractTransactionService) getLocalTransactionManager()
                        .getTransactionService()).abortAllTx();

                /**
                 * Local abort (no quorum, so 2-phase abort not required).
                 *
                 * FIXME HA : Abort the unisolated connection? (esp for group
                 * commit and the NSS level SPARQL and REST API unisolated
                 * operations). Maybe we can wrap the execute of the UpdateTask
                 * and the execution of the REST Mutation API methods in a
                 * well-known ThreadGuard and then do interruptAll() to force
                 * the cancelation of any running task? We could also wrap any
                 * IIndexManagerCallable in HAGlue.submit() with a FutureTask
                 * implementation that uses the appropriate ThreadGuard to
                 * ensure that any unisolated tasks are cancelled (that is
                 * actually overkill since it would not differentiate TX based
                 * operations from unisolated operations - we could also use
                 * that ThreadGuard in AbstractTask). Add unit tests for both
                 * UPDATE and other REST mutation methods.
                 *
                 * @see <a
                 *      href="https://sourceforge.net/apps/trac/bigdata/ticket/753"
                 *      > HA doLocalAbort() should interrupt NSS requests and
                 *      AbstractTasks </a>
                 */
                doLocalAbort();

                /*
                 * Note: We can not re-cast our vote until our last vote is
                 * withdrawn. That is currently done by QuorumWatcherBase. So,
                 * we have to wait until we observe that to cast a new vote.
                 */

                haReadyToken = Quorum.NO_QUORUM; // volatile write.

                haStatus = HAStatusEnum.NotReady; // volatile write.

                haReadyCondition.signalAll(); // signal ALL.

            } else if (transitionState.didBreak) {

                /*
                 * Note: [didLeaveMetQuorum] was handled above. So, this else if
                 * only applies to a service that observes a quorum break but
                 * which was not joined with the met quorum. As we were not
                 * joined at the break there is nothing to do save for updating
                 * the token.
                 */

                quorumToken = Quorum.NO_QUORUM; // volatile write.

                haReadyToken = Quorum.NO_QUORUM; // volatile write.

                haStatus = HAStatusEnum.NotReady; // volatile write.

                haReadyCondition.signalAll(); // signal ALL.

            } else if (transitionState.didMeet
                    || transitionState.didJoinMetQuorum) {

                /**
                 * Either a quorum meet (didMeet:=true) or the service is
                 * joining a quorum that is already met (didJoinMetQuorum).
*/ final long tmp; quorumToken = newValue; boolean installedRBs = false; final long localCommitCounter = _rootBlock.getCommitCounter(); final boolean isLeader; final boolean isFollower; if (localService.isFollower(newValue)) { isLeader = false; isFollower = true; if (localCommitCounter == 0L) { /* * Take the root blocks from the quorum leader and use * them. */ // Remote interface for the quorum leader. final HAGlue leader = localService.getLeader(newValue); haLog.info("Fetching root block from leader."); final IRootBlockView leaderRB; try { leaderRB = leader .getRootBlock( new HARootBlockRequest(null/* storeUUID */)) .getRootBlock(); } catch (IOException e) { throw new RuntimeException(e); } if (leaderRB.getCommitCounter() == 0L) { /* * Installs the root blocks and does a local abort. * * Note: This code path is only taken when both the * leader and the follower are at commitCounter==0L. * This prevents us from accidentally laying down on * a follower the root blocks corresponding to a * leader that already has committed write sets. */ localService .installRootBlocks( leaderRB.asRootBlock(true/* rootBlock0 */), leaderRB.asRootBlock(false/* rootBlock0 */)); installedRBs = true; } } // ready as follower. tmp = newValue; } else if (localService.isLeader(newValue)) { isLeader = true; isFollower = false; // ready as leader. tmp = newValue; } else { isLeader = false; isFollower = false; // Not ready. tmp = Quorum.NO_QUORUM; } /* * Note: These volatile writes need to occur before we do the * local abort since the readOnly versus readWrite state of the * journal is decided based on the [haStatus]. */ this.haReadyToken = tmp; // volatile write. // volatile write. this.haStatus = isLeader ? HAStatusEnum.Leader : isFollower ? HAStatusEnum.Follower : HAStatusEnum.NotReady; if (!installedRBs) { /** * If we install the RBs, then a local abort was already * done. Otherwise we need to do one now (this covers the * case when setQuorumToken() is called on the leader as * well as cases where the service is either not a follower * or is a follower, but the leader is not at * commitCounter==0L, etc. * * If didJoinMetQuorum==true, then we MUST be leaving the * Resync run state in the HAJournalServer, so should NOT * need to complete a localAbort. * * TODO We should still review this point. If we do not * delete a committed HALog, then why is doLocalAbort() a * problem here? Ah. It is because doLocalAbort() is hooked * by the HAJournalServer and will trigger a serviceLeave() * and a transition to the error state. * * @see <a * href="https://sourceforge.net/apps/trac/bigdata/ticket/695"> * HAJournalServer reports "follower" but is in * SeekConsensus and is not participating in * commits</a> */ if (haLog.isInfoEnabled()) haLog.info("Calling localAbort if NOT didJoinMetQuorum: " + transitionState.didJoinMetQuorum); if (!transitionState.didJoinMetQuorum) { doLocalAbort(); } } haReadyCondition.signalAll(); // signal ALL. } else { /* * Did not (leave|break|meet|join). */ if (haReadyToken != Quorum.NO_QUORUM) { /* * We should not be here if this service is HAReady. */ throw new AssertionError("VOID setToken"); } /* * We are not joined. No change in token or HAReadyToken. * * Note: This can occur (for example) if we are not yet joined * and an error occurs during our attempt to join with a met * quorum. 
One observed example is when this service is in the * joined[] for zookeeper and therefore is messaged as part of * the GATHER or PREPARE protocols for a 2-phase commit, but the * service is not yet HAReady and therefore enters an error * state rather than completing the 2-phase commit protocol * successfully. When setQuorumToken() is called from the error * handling task, the haReadyToken is already cleared. Unless * the quorum also breaks, the quorum token will be unchanged. * Hence we did not (leave|break|meet|join). */ // Fall through. } } finally { lock.unlock(); } if (haLog.isInfoEnabled()) haLog.info("done: token=" + quorumToken + ", HAReady=" + haReadyToken + ", HAStatus=" + haStatus); } private final Condition haReadyCondition = _fieldReadWriteLock.writeLock().newCondition(); private volatile long haReadyToken = Quorum.NO_QUORUM; /** * Updated with the {@link #haReadyToken}. */ private volatile HAStatusEnum haStatus = HAStatusEnum.NotReady; // /** // * Await the service being ready to partitipate in an HA quorum. The // * preconditions include: // * <ol> // * <li>receiving notice of the quorum token via // * {@link #setQuorumToken(long)}</li> // * <li>The service is joined with the met quorum for that token</li> // * <li>If the service is a follower and it's local root blocks were at // * <code>commitCounter:=0</code>, then the root blocks from the leader have // * been installed on the follower.</li> // * <ol> // * // * @return the quorum token for which the service became HA ready. // */ // final public long awaitHAReady() throws InterruptedException, // AsynchronousQuorumCloseException, QuorumException { // final WriteLock lock = _fieldReadWriteLock.writeLock(); // lock.lock(); // try { // long t = Quorum.NO_QUORUM; // while (((t = haReadyToken) == Quorum.NO_QUORUM) // && getQuorum().getClient() != null) { // haReadyCondition.await(); // } // final QuorumService<?> client = getQuorum().getClient(); // if (client == null) // throw new AsynchronousQuorumCloseException(); // if (!client.isJoinedMember(t)) { // throw new QuorumException(); // } // return t; // } finally { // lock.unlock(); // } // } // /** // * Await the service being ready to partitipate in an HA quorum. The // * preconditions include: // * <ol> // * <li>receiving notice of the quorum token via // * {@link #setQuorumToken(long)}</li> // * <li>The service is joined with the met quorum for that token</li> // * <li>If the service is a follower and it's local root blocks were at // * <code>commitCounter:=0</code>, then the root blocks from the leader have // * been installed on the follower.</li> // * <ol> // * // * @param timeout // * The timeout to await this condition. // * @param units // * The units for that timeout. // * // * @return the quorum token for which the service became HA ready. 
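//     * <p>
//     * Illustrative call (a sketch):
//     *
//     * <pre>
//     * final long token = awaitHAReady(5, TimeUnit.SECONDS);
//     * </pre>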
// */ @Override final public long awaitHAReady(final long timeout, final TimeUnit units) throws InterruptedException, TimeoutException, AsynchronousQuorumCloseException { final WriteLock lock = _fieldReadWriteLock.writeLock(); final long begin = System.nanoTime(); final long nanos = units.toNanos(timeout); long remaining = nanos; if (!lock.tryLock(remaining, TimeUnit.NANOSECONDS)) throw new TimeoutException(); try { // remaining = nanos - (now - begin) [aka elapsed] remaining = nanos - (System.nanoTime() - begin); long t = Quorum.NO_QUORUM; while (((t = haReadyToken) == Quorum.NO_QUORUM) && getQuorum().getClient() != null && remaining > 0) { if (!haReadyCondition.await(remaining, TimeUnit.NANOSECONDS)) throw new TimeoutException(); remaining = nanos - (System.nanoTime() - begin); } final QuorumService<?> client = getQuorum().getClient(); if (client == null) throw new AsynchronousQuorumCloseException(); if (remaining <= 0) throw new TimeoutException(); if (!client.isJoinedMember(t)) { throw new QuorumException(); } return t; } finally { lock.unlock(); } } /** * Returns the current value of the <code>haReadyToken</code> * (non-blocking). */ final public long getHAReady() { /* * Note: In order for this operation to be non-blocking and still ensure * proper visibility of the [haReadyToken], the token MUST be volatile, * the setQuorumToken() method MUST NOT change the value of the * [haReadyToken] until all internal actions have been taken. That is, * until it is willing to do haReadyCondition.signalAll() and release * the lock guarding that Condition. */ return haReadyToken; } /** * A simplified summary of the HA status of the service. This may be used to * reliably decide whether the service is the {@link HAStatusEnum#Leader}, a * {@link HAStatusEnum#Follower}, or {@link HAStatusEnum#NotReady}. This is * exposed both here (an RMI interface) and by the REST API. * * @return The {@link HAStatusEnum} or <code>null</code> if the store is not * associated with a {@link Quorum}. * * @see HAGlue#getHAStatus() */ final public HAStatusEnum getHAStatus() { if (quorum == null) { // Not HA. return null; } return haStatus; } /** * Assert that the {@link #getHAReady()} token has the specified value. * * @param token * The specified value. */ final public void assertHAReady(final long token) throws QuorumException { if (quorum == null) return; if (token != haReadyToken) { throw new QuorumException(HAStatusEnum.NotReady.toString()); } } /** * Install identical root blocks on the journal. This is used for a few * different conditions in HA. * <ol> * <li>When the quorum meets for the first time, we need to take the root * block from the leader and use it to replace both of our root blocks (the * initial root blocks are identical). That will make the root blocks the * same on all quorum members.</li> * <li>REBUILD: When a service goes through an automated disaster recovery, * we need to install new root blocks in order to make the local journal * logically empty. This prevents the service from attempting to interpret * the data on the backing file if there is a restart part way through the * rebuild operation.</li> * </ol> * * FIXME We should also verify the following: * * <pre> * - the DirectBufferPool.INSTANCE has the same buffer * capacity (so there will be room for the write cache * data in the buffers on all nodes). 
* </pre> * * @see QuorumService#installRootBlocks(IRootBlockView) */ protected void installRootBlocks(final IRootBlockView rootBlock0, final IRootBlockView rootBlock1) { if (rootBlock0 == null) throw new IllegalArgumentException(); if (rootBlock1 == null) throw new IllegalArgumentException(); if (!rootBlock0.isRootBlock0()) throw new IllegalArgumentException(); if (rootBlock1.isRootBlock0()) throw new IllegalArgumentException(); // if (rootBlock0.getCommitCounter() != 0L) // throw new IllegalArgumentException(); // if (rootBlock1.getCommitCounter() != 0L) // throw new IllegalArgumentException(); if (!rootBlock0.getStoreType().equals(rootBlock1.getStoreType())) throw new IllegalArgumentException(); if (!rootBlock0.getUUID().equals(rootBlock1.getUUID())) throw new IllegalArgumentException(); // if (_rootBlock.getCommitCounter() != 0) { // // throw new IllegalStateException(); // // } // final IRootBlockView rootBlock0 = rootBlock; // // final IRootBlockView rootBlock1 = new RootBlockView( // false/* rootBlock0 */, rootBlock0.asReadOnlyBuffer(), checker); final WriteLock lock = _fieldReadWriteLock.writeLock(); lock.lock(); try { // Check the root blocks before we install them. { if (!_rootBlock.getStoreType().equals(rootBlock0.getStoreType())) { /* * The StoreType must agree. */ throw new RuntimeException("Incompatible StoreType: expected=" + _rootBlock.getStoreType() + ", actual=" + rootBlock0.getStoreType()); } } // write root block through to disk and sync. _bufferStrategy.writeRootBlock(rootBlock0, ForceEnum.Force); // write 2nd root block through to disk and sync. _bufferStrategy.writeRootBlock(rootBlock1, ForceEnum.Force); // Choose the "current" root block. _rootBlock = RootBlockUtility.chooseRootBlock(rootBlock0, rootBlock1); // Save resource description (sets value returned by getUUID()). journalMetadata.set(new JournalMetadata(this)); haLog.warn("Installed new root blocks: rootBlock0=" + rootBlock0 + ", rootBlock1=" + rootBlock1); // now reset the store with the root block if (_bufferStrategy instanceof IHABufferStrategy) ((IHABufferStrategy) _bufferStrategy) .resetFromHARootBlock(_rootBlock); /* * We need to reset the backing store with the token for the new quorum. * There should not be any active writers since there was no quorum. * Thus, this should just cause the backing store to become aware of the * new quorum and enable writes. * * Note: This is done using a local abort, not a 2-phase abort. Each * node in the quorum should handle this locally when it sees the quorum * meet event. * * TODO This assumes that a service that is not joined with the quorum * will not go through an _abort(). Such a service will have to go * through the synchronization protocol. If the service is in the * pipeline when the quorum meets, even through it is not joined, and * votes the same lastCommitTime, then it MIGHT see all necessary * replicated writes and if it does, then it could synchronize * immediately. There is basically a data race here. */ doLocalAbort(); } finally { lock.unlock(); } } /** * Local commit protocol (HA). This exists to do a non-2-phase abort * in HA. */ final public void doLocalAbort() { _abort(); } /** * Local commit protocol (HA, offline). * <p> * Note: This is used to support RESTORE by replay of HALog files when * the HAJournalServer is offline. * * TODO This method should be protected. If we move the HARestore class * into this package, then it can be changed from public to protected or * package private. 
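     * <p>
     * Illustrative replay step (a sketch; how the root block is obtained from
     * the HALog file is assumed, not shown):
     *
     * <pre>
     * final IRootBlockView rb = ...; // closing root block from an HALog file.
     * doLocalCommit(rb);
     * </pre>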
 */
final public void doLocalCommit(final IRootBlockView rootBlock) {

    doLocalCommit(null/* localService */, rootBlock);

}

/**
 * Local commit protocol (HA).
 * 
 * @param localService
 *            For HA modes only. When non-<code>null</code>, this is used
 *            to identify whether the service is the leader. When the
 *            service is not the leader, we need to do some additional
 *            work to maintain the {@link IRWStrategy} allocators in synch
 *            at each commit point.
 * @param rootBlock
 *            The new root block.
 */
protected void doLocalCommit(final QuorumService<HAGlue> localService,
        final IRootBlockView rootBlock) {

    final WriteLock lock = _fieldReadWriteLock.writeLock();
    lock.lock();
    try {

        /*
         * Note: flush() is done by prepare2Phase(). The only conditions
         * under which it is not already done are (a) HARestore (when
         * localService is null) and (b) during RESTORE or RESYNC for the
         * HAJournalServer (when haStatus will be NotReady).
         */
        final boolean shouldFlush = localService == null
                || (haStatus == null || haStatus == HAStatusEnum.NotReady);

        /*
         * Force application data to stable storage _before_ we update the
         * root blocks. This option guarantees that the application data is
         * stable on the disk before the atomic commit. Some operating
         * systems and/or file systems may otherwise choose an ordered
         * write with the consequence that the root blocks are laid down on
         * the disk before the application data and a hard failure could
         * result in the loss of application data addressed by the new root
         * blocks (data loss on restart).
         * 
         * Note: We do not force the file metadata to disk. If that is
         * done, it will be done by a force() after we write the root block
         * on the disk.
         * 
         * Note: [shouldFlush] is probably sufficient. This test uses
         * [shouldFlush||true] to err on the side of safety.
         */
        if ((shouldFlush || true) && doubleSync) {
            _bufferStrategy.force(false/* metadata */);
        }

        // The timestamp for this commit point.
        final long commitTime = rootBlock.getLastCommitTime();

        // write the root block on to the backing store.
        _bufferStrategy.writeRootBlock(rootBlock, forceOnCommit);

        // set the new root block.
        _rootBlock = rootBlock;

        final boolean leader = localService == null ? false : localService
                .isLeader(rootBlock.getQuorumToken());

        if (leader) {

            if (_bufferStrategy instanceof IRWStrategy) {
                /*
                 * Now that the root blocks are down we can commit any
                 * transient state.
                 */
                ((IRWStrategy) _bufferStrategy).postCommit();
            }

        } else {

            /*
             * Ensure allocators are synced after commit. This is only done
             * for the followers. The leader has been updating the
             * in-memory allocators as it lays down the writes. The
             * followers have not been updating the allocators.
             */
            if (haLog.isInfoEnabled() && localService != null)
                haLog.info("PostHACommit: serviceUUID="
                        + localService.getServiceId());

            /*
             * Call to sync any transient state.
             */
            ((IHABufferStrategy) _bufferStrategy).postHACommit(rootBlock);

            /*
             * Clear reference and reload from the store.
             * 
             * The leader does not need to do this since it is writing on
             * the unisolated commit record index and thus the new commit
             * record is already visible in the commit record index before
             * the commit. However, the follower needs to do this since it
             * will otherwise not see the new commit points.
             */
            _commitRecordIndex = _getCommitRecordIndex();

        }

        // reload the commit record from the new root block.
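        // (The commit record address is read from the root block that was
        // just installed above, so leader and followers alike will observe
        // the new commit point from this reload.)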
        _commitRecord = _getCommitRecord();

        if (txLog.isInfoEnabled())
            txLog.info("COMMIT: commitCounter="
                    + rootBlock.getCommitCounter() + ", commitTime="
                    + commitTime);

    } finally {
        lock.unlock();
    }

}

/**
 * The current {@link Quorum} (if any).
 */
private final Quorum<HAGlue, QuorumService<HAGlue>> quorum;

/**
 * Used to pin the {@link Future} of the gather operation on the client to
 * prevent it from being finalized while the leader is still running its
 * side of the consensus protocol to update the release time for the
 * replication cluster.
 * 
 * @see #gatherMinimumVisibleCommitTime(IHAGatherReleaseTimeRequest)
 * 
 * @see <a href="https://sourceforge.net/apps/trac/bigdata/ticket/673" >
 *      Native thread leak in HAJournalServer process </a>
 */
private final AtomicReference<Future<IHANotifyReleaseTimeResponse>> gatherFuture = new AtomicReference<Future<IHANotifyReleaseTimeResponse>>();

// /**
// * The {@link Quorum} for this service -or- <code>null</code> if the
// * service is not running with a quorum.
// */
@Override
public Quorum<HAGlue, QuorumService<HAGlue>> getQuorum() {

    return quorum;

}

/**
 * Factory for the {@link HAGlue} object for this {@link AbstractJournal}.
 * The object returned by this method will be made available using
 * {@link QuorumMember#getService()}.
 * 
 * @throws UnsupportedOperationException
 *             always.
 */
protected HAGlue newHAGlue(final UUID serviceId) {

    throw new UnsupportedOperationException();

}

/**
 * Return both root blocks (atomically - used by HA).
 * <p>
 * Note: This takes a lock to ensure that the root blocks are consistent
 * with a commit point on the backing store.
 */
protected IRootBlockView[] getRootBlocks() {

    final Lock lock = _fieldReadWriteLock.readLock();
    lock.lock();
    try {

        final ChecksumUtility checker = ChecksumUtility.getCHK();

        final IRootBlockView rb0 = new RootBlockView(
                true/* rootBlock0 */, getBufferStrategy().readRootBlock(
                        true/* rootBlock0 */), checker);

        final IRootBlockView rb1 = new RootBlockView(
                false/* rootBlock0 */, getBufferStrategy().readRootBlock(
                        false/* rootBlock0 */), checker);

        return new IRootBlockView[] { rb0, rb1 };

    } finally {
        lock.unlock();
    }

}

/**
 * With the lock held to ensure that there is no concurrent commit, copy
 * key data atomically so that the recovered snapshot is consistent with
 * the commit state at the time the snapshot is taken. This atomic data
 * snapshot can be merged with the file data to ensure a valid new store
 * copy.
 * <p>
 * If this is not done then it is possible for the allocation data - both
 * metabits and fixed allocator commit bits - to be overwritten and
 * inconsistent with the saved root blocks.
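 * <p>
 * A sketch of how a snapshot consumer might drain the returned map (the
 * file-writing step is an assumption for illustration):
 * 
 * <pre>
 * final AtomicReference&lt;IRootBlockView&gt; rbv = new AtomicReference&lt;IRootBlockView&gt;();
 * final ISnapshotData sd = journal.snapshotAllocationData(rbv);
 * final Iterator&lt;ISnapshotEntry&gt; itr = sd.entries();
 * while (itr.hasNext()) {
 *     final ISnapshotEntry e = itr.next();
 *     // Overlay the raw file copy with e.getData() at offset
 *     // e.getAddress() to obtain a consistent snapshot.
 * }
 * </pre>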
 * 
 * @throws IOException
 */
public ISnapshotData snapshotAllocationData(
        final AtomicReference<IRootBlockView> rbv) throws IOException {

    final Lock lock = _fieldReadWriteLock.readLock();
    lock.lock();
    try {

        final ISnapshotData tm = new SnapshotData();

        final IBufferStrategy bs = getBufferStrategy();

        // clone rootblocks
        final ByteBuffer rb0 = bs.readRootBlock(true/* is rb0 */);
        tm.put((long) FileMetadata.OFFSET_ROOT_BLOCK0, BytesUtil.toArray(rb0));
        final ByteBuffer rb1 = bs.readRootBlock(false/* is rb0 */);
        tm.put((long) FileMetadata.OFFSET_ROOT_BLOCK1, BytesUtil.toArray(rb1));

        // Report the current root block (which carries the last
        // commitCounter) via [rbv].
        final IRootBlockView rbv0 = new RootBlockView(true/* rootBlock0 */,
                rb0, checker);
        final IRootBlockView rbv1 = new RootBlockView(false/* rootBlock0 */,
                rb1, checker);
        rbv.set(RootBlockUtility.chooseRootBlock(rbv0, rbv1));

        // Disabling this test allows demonstration of the need to
        // atomically snapshot the metabits and allocators for the RWStore
        // in conjunction with
        // TestHA1SnapshotPolicy.test_snapshot_stressMultipleTx_restore_validate
        if (bs instanceof RWStrategy) {

            final RWStore rws = ((RWStrategy) bs).getStore();

            // get metabits
            rws.snapshotMetabits(tm);

            // get committed allocations
            rws.snapshotAllocators(tm);

        }

        return tm;

    } finally {
        lock.unlock();
    }

}

/**
 * Implementation hooks into the various low-level operations required to
 * support HA for the journal.
 */
protected class BasicHA implements HAGlue {

    private final UUID serviceId;
    private final InetSocketAddress writePipelineAddr;

    protected BasicHA(final UUID serviceId,
            final InetSocketAddress writePipelineAddr) {

        if (serviceId == null)
            throw new IllegalArgumentException();
        if (writePipelineAddr == null)
            throw new IllegalArgumentException();

        this.serviceId = serviceId;
        this.writePipelineAddr = writePipelineAddr;

    }

    /**
     * The most recent prepare request.
     */
    private final AtomicReference<IHA2PhasePrepareMessage> prepareRequest = new AtomicReference<IHA2PhasePrepareMessage>();

    /**
     * Whether or not we voted "yes" for the last prepare request.
     */
    private final AtomicBoolean vote = new AtomicBoolean(false);

    /**
     * Return the backing {@link IIndexManager} (non-RMI method).
     */
    public AbstractJournal getIndexManager() {

        return AbstractJournal.this;

    }

    @Override
    public UUID getServiceId() {

        return serviceId;

    }

    @Override
    public InetSocketAddress getWritePipelineAddr() {

        return writePipelineAddr;

    }

    @Override
    public int getNSSPort() {

        throw new UnsupportedOperationException();

    }

    @Override
    public RunState getRunState() {

        throw new UnsupportedOperationException();

    }

    @Override
    public String getExtendedRunState() {

        throw new UnsupportedOperationException();

    }

    @Override
    public HAStatusEnum getHAStatus() {

        return AbstractJournal.this.getHAStatus();

    }

    @Override
    public long awaitHAReady(final long timeout, final TimeUnit units)
            throws AsynchronousQuorumCloseException, InterruptedException,
            TimeoutException {

        return AbstractJournal.this.awaitHAReady(timeout, units);

    }

    /**
     * {@inheritDoc}
     * 
     * FIXME awaitServiceJoin() is failing to set the commitCounter on the
     * message. Either create a new message type or return the right
     * message. The joining service should be able to verify that the
     * release time is applicable for its commit counter (in
     * {@link AbstractHATransactionService#runWithBarrierLock(Runnable)}.
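     * <p>
     * A hypothetical call pattern from a joining service (the
     * <code>req</code> construction is an assumption for illustration):
     * 
     * <pre>
     * // Block until the leader observes this service in the joined set.
     * final IHANotifyReleaseTimeResponse resp = leader.awaitServiceJoin(req);
     * final long pinnedReleaseTime = resp.getCommitTime();
     * </pre>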
     */
    @Override
    public IHANotifyReleaseTimeResponse awaitServiceJoin(
            final IHAAwaitServiceJoinRequest req)
            throws AsynchronousQuorumCloseException, InterruptedException,
            TimeoutException {

        /*
         * Note: Lock makes this operation MUTEX with a critical section in
         * commitNow().
         */
        _gatherLock.lock();

        try {

            final UUID serviceUUID = req.getServiceUUID();

            final long begin = System.nanoTime();
            final long nanos = req.getUnit().toNanos(req.getTimeout());
            long remaining = nanos;

            while ((remaining = nanos - (System.nanoTime() - begin)) > 0) {

                final UUID[] joined = getQuorum().getJoined();

                for (UUID t : joined) {

                    if (serviceUUID.equals(t)) {

                        /*
                         * Found it.
                         * 
                         * FIXME This should be returning the commitCounter
                         * associated with the most recent gather, not -1L.
                         */
                        if (log.isInfoEnabled())
                            log.info("Found Joined Service: " + serviceUUID);

                        final JournalTransactionService ts = (JournalTransactionService) getLocalTransactionManager()
                                .getTransactionService();

                        return new HANotifyReleaseTimeResponse(
                                ts.getReleaseTime(), -1);

                    }

                }

                // remaining := nanos - elapsed
                remaining = nanos - (System.nanoTime() - begin);

                if (remaining > 0) {

                    final long sleepMillis = Math.min(
                            TimeUnit.NANOSECONDS.toMillis(remaining), 10/* ms */);

                    if (sleepMillis <= 0) {
                        /*
                         * If remaining LT 1 ms, then fail fast.
                         */
                        throw new TimeoutException();
                    }

                    Thread.sleep(sleepMillis);

                }

            }

            // timeout.
            throw new TimeoutException();

        } finally {

            _gatherLock.unlock();

        }

    }

    @Override
    public IHADigestResponse computeDigest(final IHADigestRequest req)
            throws IOException, NoSuchAlgorithmException, DigestException {

        throw new UnsupportedOperationException();

    }

    @Override
    public IHALogDigestResponse computeHALogDigest(
            final IHALogDigestRequest req) throws IOException,
            NoSuchAlgorithmException, DigestException {

        throw new UnsupportedOperationException();

    }

    @Override
    public IHASnapshotDigestResponse computeHASnapshotDigest(
            final IHASnapshotDigestRequest req) throws IOException,
            NoSuchAlgorithmException, DigestException {

        throw new UnsupportedOperationException();

    }

    // @Override
    // public Future<Void> globalWriteLock(final IHAGlobalWriteLockRequest req)
    // throws IOException, TimeoutException, InterruptedException {
    //
    // throw new UnsupportedOperationException();
    //
    // }

    @Override
    public Future<IHASnapshotResponse> takeSnapshot(
            final IHASnapshotRequest req) throws IOException {

        throw new UnsupportedOperationException();

    }

    @Override
    public Future<Void> rebuildFromLeader(IHARemoteRebuildRequest req)
            throws IOException {

        throw new UnsupportedOperationException();

    }

    /**
     * Return a proxy object for a {@link Future} suitable for use in an
     * RMI environment (the default implementation returns its argument).
     * 
     * @param future
     *            The future.
     * 
     * @return The proxy for that future.
     */
    final protected <E> Future<E> getProxy(final Future<E> future) {

        return getProxy(future, false/* asyncFuture */);

    }

    /**
     * Return a proxy object for a {@link Future} suitable for use in an
     * RMI environment (the default implementation returns its argument).
     * 
     * @param future
     *            The future.
     * @param asyncFuture
     *            When <code>true</code>, the service should not wait for
     *            the {@link Future} to complete but should return a proxy
     *            object that may be used by the client to monitor or
     *            cancel the {@link Future}. When <code>false</code>, the
     *            method should wait for the {@link Future} to complete and
     *            then return a "thick" {@link Future} which wraps the
     *            completion state but does not permit asynchronous
     *            monitoring or cancellation of the operation wrapped by
     *            the {@link Future}.
     * 
     * @return The proxy for that future.
     */
    protected <E> Future<E> getProxy(final Future<E> future,
            final boolean asyncFuture) {

        return future;

    }

    @Override
    public Future<Boolean> prepare2Phase(
            final IHA2PhasePrepareMessage prepareMessage) {

        if (prepareMessage == null)
            throw new IllegalArgumentException();

        final boolean isRootBlock0 = prepareMessage.isRootBlock0();
        final long timeout = prepareMessage.getTimeout();
        final TimeUnit unit = prepareMessage.getUnit();
        final IRootBlockView rootBlock = prepareMessage.getRootBlock();

        if (haLog.isInfoEnabled())
            haLog.info("isJoinedService="
                    + prepareMessage.isJoinedService() + ", isRootBlock0="
                    + isRootBlock0 + ", rootBlock=" + rootBlock
                    + ", timeout=" + timeout + ", unit=" + unit);

        // the quorum token from the leader is in the root block.
        final long prepareToken = rootBlock.getQuorumToken();

        // Do not prepare if the token is wrong.
        quorum.assertQuorum(prepareToken);
        assertHAReady(prepareToken);

        // Save off a reference to the prepare request.
        prepareRequest.set(prepareMessage);

        // Clear vote (assume NO unless proven otherwise).
        vote.set(false);

        // Note: Can throw IllegalStateException (if not running).
        final QuorumService<HAGlue> quorumService = quorum.getClient();

        // Note: as decided by the leader!
        final boolean isJoined = prepareMessage.isJoinedService();

        // true iff the token is valid and this service is the quorum leader.
        final boolean isLeader = quorumService.isLeader(prepareToken);

        final FutureTask<Boolean> ft;

        if (!isJoined) {

            /*
             * A NOP task if this service is not joined with the met
             * quorum.
             */
            ft = new FutureTaskMon<Boolean>(new VoteNoTask(quorumService));

        } else {

            /*
             * A task to flush and sync if the service is joined with the
             * met quorum.
             * 
             * Note: This code path is only when [isJoined := true].
             */
            ft = new FutureTaskMon<Boolean>(new Prepare2PhaseTask(
                    isLeader, prepareMessage));

        }

        if (isLeader) {

            /*
             * Run in the caller's thread.
             * 
             * Note: In order to avoid deadlock, when the leader calls back
             * to itself it MUST do so in the same thread in which it is
             * already holding the writeLock. [Actually, we do not obtain
             * the writeLock in prepare2Phase, but all the other quorum
             * commit methods do.]
             */
            ft.run();

        } else {

            /*
             * We can't really handle the timeout in the leader's thread
             * (and it would be very odd if the leader wound up waiting on
             * itself!) but the followers can obey the timeout semantics
             * for prepare() by execute()ing the FutureTask and then
             * returning it immediately. The leader can wait on the Future
             * up to the timeout and then cancel the Future.
             */

            // submit.
            getExecutorService().execute(ft);

        }

        return getProxy(ft);

    }

    /**
     * Task votes NO (unconditional).
     * <p>
     * Note: If we were not joined at the start of the 2-phase commit, then
     * we will not participate. This provides an atomic decision point with
     * respect to when a service that is rebuilding or resynchronizing will
     * participate in a 2-phase commit. By voting NO here, the
     * commit2Phase() operation will be a NOP for THIS service.
     * <p>
     * Note: The vote of a service that was not joined with the met quorum
     * at the time that we begin the 2-phase commit protocol is ignored.
     */
    protected class VoteNoTask implements Callable<Boolean> {

        private final QuorumService<HAGlue> localService;

        public VoteNoTask(final QuorumService<HAGlue> localService) {

            this.localService = localService;

        }

        @Override
        public Boolean call() throws Exception {

            // Vote NO.
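            // Record the NO vote, discard the local write set via a local
            // abort, and (if we were joined at the atomic decision point)
            // force the service into the error state so that it does a
            // service leave and resynchronizes.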
            vote.set(false);

            final IHA2PhasePrepareMessage req = prepareRequest.get();

            doLocalAbort();

            if (req.isJoinedService()) {

                /*
                 * Force a service that was joined at the atomic decision
                 * point of the 2-phase commit protocol to do a service
                 * leave.
                 */
                if (localService != null) {
                    localService.enterErrorState();
                }

            }

            return vote.get();

        }

    } // class VoteNoTask

    /**
     * Task prepares for a 2-phase commit (syncs to the disk) and votes YES
     * iff it is able to prepare successfully.
     * <p>
     * Note: This code path is only when [isJoined := true].
     */
    private class Prepare2PhaseTask implements Callable<Boolean> {

        private final boolean isLeader;
        private final IHA2PhasePrepareMessage prepareMessage;

        public Prepare2PhaseTask(//
                final boolean isLeader,//
                final IHA2PhasePrepareMessage prepareMessage) {

            if (prepareMessage == null)
                throw new IllegalArgumentException();

            if (!prepareMessage.isJoinedService()) {
                /*
                 * Only services that are joined as of the atomic decision
                 * point in commitNow() are sent a PREPARE message.
                 */
                throw new AssertionError();
            }

            this.isLeader = isLeader;
            this.prepareMessage = prepareMessage;

        }

        /**
         * Note: This code path is only when [isJoined := true].
         */
        @Override
        public Boolean call() throws Exception {

            QuorumService<HAGlue> localService = null;
            try {

                /*
                 * Note: Throws IllegalStateException if quorum has been
                 * terminated. We can't PREPARE if the quorum is
                 * terminated.
                 */
                localService = quorum.getClient();

                return innerCall();

            } finally {

                if (haLog.isInfoEnabled())
                    haLog.info("VOTE=" + vote.get());

                if (!vote.get()) {

                    /**
                     * Since the service refuses to PREPARE we want it to
                     * enter an error state and then figure out whether it
                     * needs to resynchronize with the quorum.
                     * <p>
                     * Note: Entering the error state will cause the local
                     * abort and serviceLeave() actions to be taken, which
                     * is why they have been commented out above.
                     * 
                     * @see <a
                     *      href="https://sourceforge.net/apps/trac/bigdata/ticket/695">
                     *      HAJournalServer reports "follower" but is in
                     *      SeekConsensus and is not participating in
                     *      commits</a>
                     */
                    if (localService != null) {
                        localService.enterErrorState();
                    }

                }

            }

        }

        private Boolean innerCall() throws Exception {

            /*
             * Get and clear the [gatherFuture]. A service which was joined
             * at the atomic decision point for the GATHER will have a
             * non-null Future here. A service which is newly joined and
             * which joined *after* the GATHER will have a [null] Future
             * here. If the service participated in the gather, then we
             * will use this Future to decide if it should vote NO. If the
             * service joined *after* the GATHER, then the Future will be
             * [null] and we will ignore it.
             * 
             * Note: This is checked below.
             */
            final Future<IHANotifyReleaseTimeResponse> oldFuture = gatherFuture
                    .getAndSet(null/* newValue */);

            if (haLog.isInfoEnabled())
                haLog.info("gatherFuture=" + oldFuture);

            final IRootBlockView rootBlock = prepareMessage.getRootBlock();

            if (haLog.isInfoEnabled())
                haLog.info("preparedRequest=" + rootBlock + ", isLeader: "
                        + isLeader);

            if (rootBlock == null)
                throw new IllegalStateException();

            // Validate new root block against current root block.
            validateNewRootBlock(/* isJoined, */isLeader,
                    AbstractJournal.this._rootBlock, rootBlock);

            if (haLog.isInfoEnabled())
                haLog.info("validated=" + rootBlock);

            /*
             * Verify that the local release time is consistent with the
             * GATHER.
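             * 
             * Per the GatherTask contract, the expected local release time
             * is max(0L, consensusReleaseTime.getCommitTime() - 1), which
             * is exactly what is verified below.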
             */
            final IHANotifyReleaseTimeResponse consensusReleaseTime = prepareMessage
                    .getConsensusReleaseTime();

            {

                if (oldFuture != null) {

                    /*
                     * If we ran the GATHER task, then we must await the
                     * outcome of the GATHER on this service before we can
                     * verify that the local consensus release time is
                     * consistent with the GATHER.
                     * 
                     * Note: If the oldFuture is null, then the service
                     * just joined and was explicitly handed the consensus
                     * release time and hence should be consistent here
                     * anyway.
                     */
                    oldFuture.get();

                }

                final long localReleaseTime = getLocalTransactionManager()
                        .getTransactionService().getReleaseTime();

                // Note: Per the GatherTask (in Journal.java).
                final long expectedReleaseTime = Math.max(0L,
                        consensusReleaseTime.getCommitTime() - 1);

                if (localReleaseTime != expectedReleaseTime) {

                    throw new AssertionError(
                            "Local service does not agree with consensusReleaseTime: localReleaseTime="
                                    + localReleaseTime
                                    + ", expectedReleaseTime="
                                    + expectedReleaseTime
                                    + ", consensusReleaseTime="
                                    + consensusReleaseTime + ", serviceId="
                                    + getServiceId());

                }

            }

            /*
             * if(follower) {...}
             */
            if (/* isJoined && */!isLeader) {

                /**
                 * This is a follower.
                 * 
                 * Validate the release time consensus protocol was
                 * completed successfully on the follower.
                 * 
                 * @see <a
                 *      href="https://sourceforge.net/apps/trac/bigdata/ticket/673"
                 *      > Native thread leak in HAJournalServer process </a>
                 */

                if (!prepareMessage.isGatherService()) {

                    /*
                     * This service did not participate in the GATHER.
                     * Instead, it joined after the GATHER but before the
                     * PREPARE.
                     */

                    // [gatherFuture] should have been [null].
                    assert oldFuture == null;

                    vote.set(true);

                    // Done.
                    return vote.get();

                }

                /**
                 * Note: We need to block here (on oldFuture.get()) in case
                 * the follower has not finished applying the updated
                 * release time.
                 */
                try {

                    // Note: [oldFuture] MUST be non-null!
                    final IHANotifyReleaseTimeResponse tmp = oldFuture.get();

                    if ((tmp.getCommitCounter() != consensusReleaseTime
                            .getCommitCounter())
                            || (tmp.getCommitTime() != consensusReleaseTime
                                    .getCommitTime())) {

                        throw new AssertionError(
                                "GatherTask reports different consensus: GatherTask="
                                        + tmp + ", consensusReleaseTime="
                                        + consensusReleaseTime);

                    }

                    /*
                     * Gather was successful - fall through.
                     */

                } catch (InterruptedException e) {

                    /*
                     * Note: Future isDone(). Caller should not block.
                     */
                    throw new AssertionError();

                } catch (CancellationException e) {

                    /*
                     * Gather cancelled on the follower (e.g., immediately
                     * above).
                     */
                    haLog.error("Gather cancelled on follower: serviceId="
                            + getServiceId() + " : " + e, e);

                    return vote.get();

                } catch (ExecutionException e) {

                    // Gather failed on the follower.
                    haLog.error("Gather failed on follower: serviceId="
                            + getServiceId() + " : " + e, e);

                    return vote.get();

                }

            }

            /*
             * Call to ensure strategy does everything required for itself
             * before final root block commit. At a minimum it must flush
             * its write cache to the backing file (issue the writes).
             */
            // _bufferStrategy.commit(); // lifted to before we retrieve
            // RootBlock in commitNow

            /*
             * Force application data to stable storage _before_ we update
             * the root blocks. This option guarantees that the application
             * data is stable on the disk before the atomic commit. Some
             * operating systems and/or file systems may otherwise choose
             * an ordered write with the consequence that the root blocks
             * are laid down on the disk before the application data and a
             * hard failure could result in the loss of application data
             * addressed by the new root blocks (data loss on restart).
             * 
             * Note: We do not force the file metadata to disk.
             * If that is done, it will be done by a force() after we
             * write the root block on the disk.
             */
            if (doubleSync) {
                _bufferStrategy.force(false/* metadata */);
            }

            if (prepareMessage.voteNo()) {
                /*
                 * Hook allows the test suite to force a NO vote.
                 */
                throw new Mock2PhaseCommitProtocolException("Force NO vote");
            }

            // Vote YES.
            vote.set(true);

            return vote.get();

        }

    }

    /**
     * Validate the new root block against the current root block. This
     * method checks a variety of invariants:
     * <ul>
     * <li>The UUID of the store must be the same.</li>
     * <li>The commitTime must be strictly increasing.</li>
     * <li>The commitCounter must increase by ONE (1).</li>
     * </ul>
     * Note: This code path is only when [isJoined := true] (that is, the
     * service was joined with the met quorum at the atomic decision point
     * for the joined set for the 2-phase commit).
     * 
     * @param isLeader
     *            iff this service is the leader for this commit.
     * @param oldRB
     *            the old (aka current) root block.
     * @param newRB
     *            the new (aka proposed) root block.
     */
    // * @param isJoined
    // * iff this service was joined at the atomic decision point
    // * in the 2-phase commit protocol.
    protected void validateNewRootBlock(//final boolean isJoined,
            final boolean isLeader, final IRootBlockView oldRB,
            final IRootBlockView newRB) {

        if (oldRB == null)
            throw new IllegalStateException();

        if (newRB == null)
            throw new IllegalStateException();

        // Validate UUID of store is consistent.
        if (!newRB.getUUID().equals(oldRB.getUUID())) {

            /*
             * The root block has a different UUID. We can not accept this
             * condition.
             */
            throw new IllegalStateException("Store UUID: old="
                    + oldRB.getUUID() + " != new=" + newRB.getUUID());

        }

        // Validate commit time is strictly increasing.
        if (newRB.getLastCommitTime() <= oldRB.getLastCommitTime()) {

            /*
             * The root block has a commit time that is LTE the most recent
             * commit on this Journal. We can not accept this condition.
             */
            throw new IllegalStateException("lastCommitTime: old="
                    + oldRB.getLastCommitTime() + " >= new="
                    + newRB.getLastCommitTime());

        }

        // Validate the new commit counter.
        {

            final long newcc = newRB.getCommitCounter();
            final long oldcc = oldRB.getCommitCounter();

            if (newcc != (oldcc + 1)) {

                /*
                 * The new root block MUST have a commit counter that is
                 * ONE more than the current commit counter on this
                 * Journal. We can not accept any other value for the
                 * commit counter.
                 */
                throw new IllegalStateException("commitCounter: ( old="
                        + oldcc + " + 1 ) != new=" + newcc);

            }

        }

        // The quorum token from the leader is in the root block.
        final long prepareToken = newRB.getQuorumToken();

        // Verify that the same quorum is still met.
        quorum.assertQuorum(prepareToken);

        // Verify HA ready for that token.
        assertHAReady(prepareToken);

        // Note: Throws IllegalStateException if quorum not running.
        final QuorumService<HAGlue> localService = quorum.getClient();

        if (isLeader) {

            /*
             * Verify still the leader.
             */
            if (!localService.isLeader(prepareToken))
                throw new IllegalStateException("Not leader.");

            final HAStatusEnum st = getHAStatus();

            if (!HAStatusEnum.Leader.equals(st)) {
                throw new IllegalStateException("HAStatusEnum: expected="
                        + HAStatusEnum.Leader + ", actual=" + st);
            }

        } else {

            /*
             * Verify still a follower.
             */
            if (!localService.isFollower(prepareToken))
                throw new IllegalStateException("Not follower.");

            final HAStatusEnum st = getHAStatus();

            if (!HAStatusEnum.Follower.equals(st)) {
                throw new IllegalStateException("HAStatusEnum: expected="
                        + HAStatusEnum.Follower + ", actual=" + st);
            }

        }

        final long tmp = getHAReady();

        if (prepareToken != tmp) {
            throw new IllegalStateException("HAReadyToken: expected="
                    + prepareToken + ", actual=" + tmp);
        }

    }

    @Override
    public Future<Void> commit2Phase(
            final IHA2PhaseCommitMessage commitMessage) {

        final FutureTask<Void> ft = new FutureTaskMon<Void>(
                new Commit2PhaseTask(commitMessage), null/* Void */);

        /*
         * Run in the caller's thread.
         * 
         * Note: In order to avoid deadlock, when the leader calls back to
         * itself it MUST do so in the same thread in which it is already
         * holding the writeLock.
         */
        ft.run();

        return getProxy(ft);

    }

    /**
     * 2-Phase commit (service must have voted YES for the 2-phase
     * prepare).
     */
    private class Commit2PhaseTask implements Runnable {

        private final IHA2PhaseCommitMessage commitMessage;

        public Commit2PhaseTask(final IHA2PhaseCommitMessage commitMessage) {

            if (commitMessage == null)
                throw new IllegalArgumentException();

            this.commitMessage = commitMessage;

        }

        @Override
        public void run() {

            QuorumService<HAGlue> localService = null;

            _fieldReadWriteLock.writeLock().lock();

            try {

                /*
                 * Note: Throws IllegalStateException if quorum has been
                 * terminated. We can't go through the 2-phase commit if
                 * the quorum is terminated.
                 */
                localService = quorum.getClient();

                doInnerRun(localService);

            } catch (Throwable t) {

                try {
                    haLog.error("ERROR IN 2-PHASE COMMIT: " + t
                            + ", rootBlock="
                            + prepareRequest.get().getRootBlock(), t);
                } catch (Throwable t2) {
                    log.error(t2, t2);
                }

                if (localService != null) {
                    localService.enterErrorState();
                }

                // always rethrow the root cause exception.
                throw new RuntimeException(t);

            } finally {

                // Discard the prepare request.
                prepareRequest.set(null/* discard */);

                // Discard the vote.
                vote.set(false);

                _fieldReadWriteLock.writeLock().unlock();

            }

        }

        private void doInnerRun(final QuorumService<HAGlue> localService)
                throws Exception {

            final IHA2PhasePrepareMessage prepareMessage = prepareRequest
                    .get();

            if (prepareMessage == null)
                throw new IllegalStateException();

            if (!prepareMessage.isJoinedService()) {
                /*
                 * Only services that are joined as of the atomic decision
                 * point should receive a PREPARE or COMMIT message.
                 */
                throw new AssertionError();
            }

            // Note: Could throw ChecksumError.
            final IRootBlockView rootBlock = prepareMessage == null ? null
                    : prepareMessage.getRootBlock();

            final long commitTime = commitMessage.getCommitTime();

            if (rootBlock == null)
                throw new IllegalStateException();

            if (haLog.isInfoEnabled())
                haLog.info("commitTime=" + commitTime + ", commitCounter="
                        + prepareMessage.getRootBlock().getCommitCounter()
                        + ", vote=" + vote);

            if (rootBlock.getLastCommitTime() != commitTime) {
                /*
                 * The commit time does not agree with the root block from
                 * the prepare message.
                 */
                throw new IllegalStateException();
            }

            if (!vote.get()) {

                /*
                 * This service voted NO. It will not participate in the
                 * commit.
                 */
                haLog.warn("IGNORING COMMIT2PHASE");

                return;

            }

            // verify that the quorum has not changed.
            quorum.assertQuorum(rootBlock.getQuorumToken());

            if (commitMessage.failCommit_beforeWritingRootBlockOnJournal()) {
                throw new Mock2PhaseCommitProtocolException();
            }

            /*
             * Write the root block on the local journal.
             */
            AbstractJournal.this.doLocalCommit(localService, rootBlock);

            if (commitMessage.failCommit_beforeClosingHALog()) {
                throw new Mock2PhaseCommitProtocolException();
            }

            /*
             * Write the root block on the HALog file, closing out that
             * file.
             */
            localService.logRootBlock(rootBlock);

            if (commitMessage.didAllServicesPrepare()) {

                /*
                 * The HALog files are conditionally purged (depending on
                 * the IRestorePolicy) on each node any time the quorum is
                 * fully met and goes through a commit point. The current
                 * HALog always remains open.
                 * 
                 * Note: This decision needs to be made in awareness of
                 * whether all services voted to PREPARE. Otherwise we can
                 * hit a problem where some service did not vote to
                 * prepare, but the other services did, and we wind up
                 * purging the HALogs even though one of the services did
                 * not go through the commit2Phase(). This issue is fixed
                 * by the didAllServicesPrepare() flag.
                 */
                localService.purgeHALogs(rootBlock.getQuorumToken());

            }

        } // doInnerRun()

    } // Commit2PhaseTask

    @Override
    public Future<Void> abort2Phase(final IHA2PhaseAbortMessage abortMessage) {

        final FutureTask<Void> ft = new FutureTaskMon<Void>(
                new Abort2PhaseTask(abortMessage), null/* Void */);

        /*
         * Run in the caller's thread.
         * 
         * Note: In order to avoid deadlock, when the leader calls back to
         * itself it MUST do so in the same thread in which it is already
         * holding the writeLock.
         */
        ft.run();

        return getProxy(ft);

    }

    /**
     * 2-Phase abort.
     */
    private class Abort2PhaseTask implements Runnable {

        private final IHA2PhaseAbortMessage abortMessage;

        public Abort2PhaseTask(final IHA2PhaseAbortMessage abortMessage) {

            if (abortMessage == null)
                throw new IllegalArgumentException();

            this.abortMessage = abortMessage;

        }

        @Override
        public void run() {

            try {

                // Discard the prepare request.
                prepareRequest.set(null/* discard */);

                // Discard the vote.
                vote.set(false);

                final long token = abortMessage.getQuorumToken();

                if (haLog.isInfoEnabled())
                    haLog.info("token=" + token);

                /*
                 * Note: even if the quorum breaks, we still need to
                 * discard our local state. Forcing doLocalAbort() here is
                 * MUCH safer than failing to invoke it because the quorum
                 * is broken. If we do not invoke doLocalAbort() then we
                 * could have an old write set lying around on the journal
                 * and it might accidentally get flushed through with a
                 * local commit. If we always force the local abort here,
                 * then the worst circumstance would be if a 2-phase abort
                 * message for a historical quorum state were somehow
                 * delayed and arrived after we entered a new quorum state.
                 * Forcing an abort under that weird (perhaps impossible)
                 * circumstance will just cause this service to drop out of
                 * the quorum if it later observes a write with the wrong
                 * block sequence or commit counter. This seems like a safe
                 * decision.
                 */
                //quorum.assertQuorum(token); // REMOVED per comment above.

            } finally {

                // ALWAYS go through the local abort.
                doLocalAbort();

            }

        }

    }

    /**
     * {@inheritDoc}
     * 
     * @todo We should test the LRUNexus for failover reads and install
     *       records into the cache if there is a cache miss. Unfortunately
     *       the cache holds objects, some of which declare how to access
     *       the underlying {@link IDataRecord} using the
     *       {@link IDataRecordAccess} interface.
     * 
     * @todo Since these are rare events it may not be worthwhile to set up
     *       a separate low-level socket service to send/receive the data.
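     * <p>
     * Sketch of a failover read against another quorum member (the remote
     * service handle, request constructor, and response accessor are
     * assumptions for illustration):
     * 
     * <pre>
     * // Ask a peer to read the record when the local read fails (e.g.,
     * // on a checksum error).
     * final IHAReadResponse resp = peer.readFromDisk(
     *         new HAReadRequest(token, storeUUID, addr)).get();
     * final byte[] data = resp.getReadData(); // accessor name assumed
     * </pre>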
     */
    @Override
    public Future<IHAReadResponse> readFromDisk(final IHAReadRequest msg) {

        final long token = msg.getQuorumToken();
        // final UUID storeId = msg.getStoreUUID();
        final long addr = msg.getAddr();

        if (haLog.isInfoEnabled())
            haLog.info("token=" + token + ", addr=" + addr);

        final FutureTask<IHAReadResponse> ft = new FutureTask<IHAReadResponse>(
                new Callable<IHAReadResponse>() {

                    @Override
                    public IHAReadResponse call() throws Exception {

                        if (haLog.isInfoEnabled())
                            haLog.info("token=" + token);

                        quorum.assertQuorum(token);

                        // final ILRUCache<Long, Object> cache = (LRUNexus.INSTANCE
                        // == null) ? null
                        // : LRUNexus.getCache(jnl);
                        //
                        // Object obj = cache.get(addr);
                        //
                        // if(obj != null && obj instanceof IDataRecordAccess) {
                        //
                        // return ((IDataRecordAccess)obj).data();
                        //
                        // }

                        // read from the local store.
                        final ByteBuffer b = ((IHABufferStrategy) getBufferStrategy())
                                .readFromLocalStore(addr);

                        final byte[] a = BytesUtil.toArray(b);

                        // cache.putIfAbsent(addr, b);

                        return new HAReadResponse(a);

                    }

                });

        ft.run();

        return getProxy(ft);

    }

    /*
     * Delegated to HAQuorumService.
     */
    @Override
    public Future<Void> receiveAndReplicate(final IHASyncRequest req,
            final IHASendState snd, final IHAWriteMessage msg)
            throws IOException {

        if (haLog.isDebugEnabled())
            haLog.debug("req=" + req + ", msg=" + msg);

        final Future<Void> ft = quorum.getClient().receiveAndReplicate(
                req, snd, msg);

        return getProxy(ft);

    }

    /*
     * This is implemented by HAJournal, which is responsible for
     * maintaining the HA Log files.
     */
    @Override
    public IHALogRootBlocksResponse getHALogRootBlocksForWriteSet(
            IHALogRootBlocksRequest msg) throws IOException {
        throw new UnsupportedOperationException();
    }

    /*
     * This is implemented by HAJournal, which is responsible for
     * maintaining the HA Log files.
     */
    @Override
    public Future<Void> sendHALogForWriteSet(IHALogRequest msg)
            throws IOException {
        throw new UnsupportedOperationException();
    }

    /*
     * This is implemented by HAJournal.
     */
    @Override
    public Future<IHASendStoreResponse> sendHAStore(IHARebuildRequest msg)
            throws IOException {
        throw new UnsupportedOperationException();
    }

    @Override
    public IHAWriteSetStateResponse getHAWriteSetState(
            final IHAWriteSetStateRequest req) {

        final long token = getQuorum().token();

        // Verify leader for that token.
        getQuorum().assertLeader(token);

        /*
         * Note: This lock will prevent concurrent commit so the
         * commitCounter is known to be valid for the blockSequence.
         */
        final IHAWriteSetStateResponse resp;

        final Lock lock = _fieldReadWriteLock.readLock();
        lock.lock();
        try {

            final IRootBlockView rb = _rootBlock;

            final long sequence = ((IHABufferStrategy) getBufferStrategy())
                    .getCurrentBlockSequence();

            resp = new HAWriteSetStateResponse(rb.getCommitCounter(),
                    rb.getLastCommitTime(), sequence);

        } finally {
            lock.unlock();
        }

        // Verify still leader for that token.
        getQuorum().assertLeader(token);

        return resp;

    }

    @Override
    public IHARootBlockResponse getRootBlock(final IHARootBlockRequest msg) {

        // storeId is optional (used in scale-out).
        final UUID storeId = msg.getStoreUUID();
        // if (storeId == null)
        // throw new IllegalArgumentException();

        if (haLog.isInfoEnabled())
            haLog.info("storeId=" + storeId);

        if (storeId != null && !getUUID().equals(storeId)) {

            // A request for a different journal's root block.
            throw new UnsupportedOperationException();

        }

        if (msg.isNonBlocking()) {

            // Non-blocking code path.
            return new HARootBlockResponse(
                    AbstractJournal.this.getRootBlockView());

        } else {

            // Blocking code path.
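            // Take the read lock so that the root block returned is
            // atomic with respect to a concurrent commit (the write lock
            // is held across the root block update).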
            final ReadLock lock = _fieldReadWriteLock.readLock();
            lock.lock();
            try {

                return new HARootBlockResponse(
                        AbstractJournal.this.getRootBlockView());

            } finally {
                lock.unlock();
            }

        }

    }

    // @Override
    // public Future<Void> bounceZookeeperConnection() {
    // final FutureTask<Void> ft = new FutureTaskMon<Void>(new Runnable() {
    // public void run() {
    // // NOP (not implemented at this layer).
    // }
    // }, null);
    // ft.run();
    // return getProxy(ft);
    // }
    //
    // @Override
    // public Future<Void> enterErrorState() {
    // final FutureTask<Void> ft = new FutureTaskMon<Void>(new Runnable() {
    // public void run() {
    // // NOP (not implemented at this layer).
    // }
    // }, null);
    // ft.run();
    // return getProxy(ft);
    // }

    /**
     * {@inheritDoc}
     * <p>
     * This implementation does pipeline remove() followed by pipeline
     * add().
     */
    @Override
    public Future<Void> moveToEndOfPipeline() {

        final FutureTask<Void> ft = new FutureTaskMon<Void>(new Runnable() {

            public void run() {

                if (haLog.isInfoEnabled())
                    haLog.info("");

                final QuorumActor<?, ?> actor = quorum.getActor();

                actor.pipelineRemove();

                actor.pipelineAdd();

            }

        }, null/* result */);

        getExecutorService().execute(ft);

        return getProxy(ft);

    }

    @Override
    public Future<IHAPipelineResetResponse> resetPipeline(
            final IHAPipelineResetRequest req) throws IOException {

        final Future<IHAPipelineResetResponse> f = quorum.getClient()
                .resetPipeline(req);

        return getProxy(f);

    }

    /*
     * HATXSGlue.
     * 
     * Note: API is mostly implemented by Journal/HAJournal.
     */

    // /**
    // * Clear the {@link #gatherFuture} and return <code>true</code> iff
    // * the {@link Future} was available, was already done, and the
    // * computation did not result in an error. Otherwise return
    // * <code>false</code>.
    // * <p>
    // * Note: This is invoked from
    // * {@link #prepare2Phase(IHA2PhasePrepareMessage)} to determine whether
    // * the gather operation on the follower completed normally. It is also
    // * invoked from {@link AbstractJournal#doLocalAbort()} and from
    // * {@link #gatherMinimumVisibleCommitTime(IHAGatherReleaseTimeRequest)}
    // * to ensure that the outcome from a previous gather is cleared before
    // * a new one is attempted.
    // *
    // * @see <a href="https://sourceforge.net/apps/trac/bigdata/ticket/673" >
    // * Native thread leak in HAJournalServer process </a>
    // */
    // private boolean clearGatherOutcome() {
    // final Future<Void> oldFuture = gatherFuture
    // .getAndSet(null/* newValue */);
    // if (oldFuture != null) {
    // if(!oldFuture.isDone()) {
    // // Ensure cancelled.
    // oldFuture.cancel(true/*mayInterruptIfRunning*/);
    // }
    // try {
    // oldFuture.get();
    // // Gather was successful.
    // return true;
    // } catch (InterruptedException e) {
    // // Note: Future isDone(). Caller will not block.
    // throw new AssertionError();
    // } catch (ExecutionException e) {
    // haLog.error("Gather failed on follower: serviceId="
    // + getServiceId() + " : " + e, e);
    // return false;
    // }
    // }
    // // Outcome was not available.
    // return false;
    // }

    /**
     * {@inheritDoc}
     * 
     * @see <a
     *      href="https://sourceforge.net/apps/trac/bigdata/ticket/673"
     *      > Native thread leak in HAJournalServer process </a>
     */
    @Override
    public void gatherMinimumVisibleCommitTime(
            final IHAGatherReleaseTimeRequest req) throws IOException {

        if (haLog.isInfoEnabled())
            haLog.info("req=" + req);

        {
            /*
             * Clear the old outcome. Reference SHOULD be null. Ensure not
             * running.
             */
            final Future<IHANotifyReleaseTimeResponse> oldFuture = gatherFuture
                    .getAndSet(null);

            if (oldFuture != null && !oldFuture.isDone())
                oldFuture.cancel(true/* mayInterruptIfRunning */);

        }

        /*
         * Lookup the leader using its UUID.
         * 
         * Note: We do not use the token to find the leader. If the token
         * is invalid, then we will handle that once we are in the
         * GatherTask.
         * 
         * Note: We do this early and pass it into the GatherTask. We can
         * not send back an RMI response unless we know the leader's
         * proxy.
         */

        final UUID leaderId = req.getLeaderId();

        // Note: Will throw exception if our HAQuorumService is not running.
        final HAGlue leader = getQuorum().getClient().getService(leaderId);

        if (leader == null)
            throw new RuntimeException(
                    "Could not discover the quorum leader.");

        // Get our serviceId.
        final UUID serviceId = getServiceId();

        if (serviceId == null)
            throw new AssertionError();

        final Callable<IHANotifyReleaseTimeResponse> task = ((AbstractHATransactionService) AbstractJournal.this
                .getLocalTransactionManager().getTransactionService())
                .newGatherMinimumVisibleCommitTimeTask(leader, serviceId,
                        req);

        final FutureTask<IHANotifyReleaseTimeResponse> ft = new FutureTask<IHANotifyReleaseTimeResponse>(
                task);

        // Save reference to the gather Future.
        gatherFuture.set(ft);

        /*
         * Fire and forget. The Future is checked by prepare2Phase.
         * 
         * Note: This design pattern was used due to a DGC thread leak
         * issue. The gather protocol should be robust even though the
         * Future is not checked (or awaited) here.
         */
        getExecutorService().execute(ft);

        return;

    }

    @Override
    public IHANotifyReleaseTimeResponse notifyEarliestCommitTime(
            final IHANotifyReleaseTimeRequest req) throws IOException,
            InterruptedException, BrokenBarrierException {

        /*
         * Note: Pass through [req] without checks. We need to get this
         * message to the CyclicBarrier regardless of whether it is
         * well-formed or valid.
         */

        return ((HATXSGlue) AbstractJournal.this
                .getLocalTransactionManager().getTransactionService())
                .notifyEarliestCommitTime(req);

    }

    /**
     * This exposes the clock used to assign transaction identifiers and
     * commit times. It is being exposed to support certain kinds of
     * overrides for unit tests.
     * <p>
     * Note: This method is NOT exposed to RMI. However, it can still be
     * overridden by the unit tests.
     * 
     * @return The next timestamp from that clock.
     */
    public long nextTimestamp() {

        try {

            return AbstractJournal.this.getLocalTransactionManager()
                    .getTransactionService().nextTimestamp();

        } catch (IOException ex) {

            /*
             * Note: This is a local method call. IOException will not be
             * thrown.
             */
            throw new RuntimeException(ex);

        }

    }

    /*
     * IService
     */

    @Override
    public UUID getServiceUUID() throws IOException {

        return getServiceId();

    }

    @SuppressWarnings("rawtypes")
    @Override
    public Class getServiceIface() throws IOException {

        return HAGlue.class;

    }

    @Override
    public String getHostname() throws IOException {

        return AbstractStatisticsCollector.fullyQualifiedHostName;

    }

    @Override
    public String getServiceName() throws IOException {

        // TODO Configurable service name?
        return getServiceIface().getName() + "@" + getHostname() + "#"
                + hashCode();

    }

    @Override
    public void destroy() throws RemoteException {

        AbstractJournal.this.destroy();

    }

    @Override
    public <T> Future<T> submit(final IIndexManagerCallable<T> callable,
            final boolean asyncFuture) throws IOException {

        callable.setIndexManager(getIndexManager());

        final Future<T> ft = getIndexManager().getExecutorService()
                .submit(callable);

        return getProxy(ft, asyncFuture);

    }

};

/**
 * Remove all commit records between the two provided keys.
 * <p>
 * This is called from the RWStore when it checks for deferredFrees
 * against the CommitRecordIndex where the CommitRecords reference the
 * deleteBlocks that have been deferred.
 * <p>
 * Once processed, the records for the affected range must be removed as
 * they reference invalid states.
 * 
 * @param fromKey
 * @param toKey
 * 
 * @see https://sourceforge.net/apps/trac/bigdata/ticket/440
 * @see IHistoryManager#checkDeferredFrees(AbstractJournal)
 */
public int removeCommitRecordEntries(final byte[] fromKey,
        final byte[] toKey) {

    // Use the LIVE index!
    final CommitRecordIndex cri = _commitRecordIndex;

    @SuppressWarnings("unchecked")
    final ITupleIterator<CommitRecordIndex.Entry> commitRecords = cri
            .rangeIterator(fromKey, toKey, 0/* capacity */,
                    IRangeQuery.DEFAULT | IRangeQuery.CURSOR, null/* filter */);

    int removed = 0;

    while (commitRecords.hasNext()) {

        final ITuple<CommitRecordIndex.Entry> t = commitRecords.next();

        // Delete the associated ICommitRecord.
        delete(t.getObject().addr);

        // Remove the entry for the commit record from the commit record
        // index.
        commitRecords.remove();

        removed++;

    }

    return removed;

}

public interface ISnapshotEntry {

    long getAddress();

    byte[] getData();

}

public interface ISnapshotData {

    void put(long addr, byte[] data);

    Iterator<ISnapshotEntry> entries();

}

static public class SnapshotData implements ISnapshotData {

    final TreeMap<Long, byte[]> m_map = new TreeMap<Long, byte[]>();

    @Override
    public void put(long addr, byte[] data) {

        m_map.put(addr, data);

    }

    @Override
    public Iterator<ISnapshotEntry> entries() {

        final Iterator<Map.Entry<Long, byte[]>> entries = m_map.entrySet()
                .iterator();

        return new Iterator<ISnapshotEntry>() {

            @Override
            public boolean hasNext() {
                return entries.hasNext();
            }

            @Override
            public ISnapshotEntry next() {

                final Map.Entry<Long, byte[]> entry = entries.next();

                return new ISnapshotEntry() {

                    @Override
                    public long getAddress() {
                        return entry.getKey();
                    }

                    @Override
                    public byte[] getData() {
                        return entry.getValue();
                    }

                };

            }

            @Override
            public void remove() {
                entries.remove();
            }

        };

    }

}

@Override
public IAllocationContext newAllocationContext(final boolean isolated) {

    if (_bufferStrategy instanceof RWStrategy) {

        return ((RWStrategy) _bufferStrategy)
                .newAllocationContext(isolated);

    } else {

        return null;

    }

}

}