/** Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved. Contact: SYSTAP, LLC DBA Blazegraph 2501 Calvert ST NW #106 Washington, DC 20008 licenses@blazegraph.com This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; version 2 of the License. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ package com.bigdata.journal; import java.io.File; import java.io.IOException; import java.nio.ByteBuffer; import java.util.Collection; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Properties; import java.util.UUID; import java.util.concurrent.BrokenBarrierException; import java.util.concurrent.Callable; import java.util.concurrent.CancellationException; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.CyclicBarrier; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; import java.util.concurrent.FutureTask; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.ScheduledFuture; import java.util.concurrent.Semaphore; import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicReference; import java.util.concurrent.locks.ReentrantLock; import org.apache.log4j.Logger; import cern.colt.Arrays; import com.bigdata.bfs.BigdataFileSystem; import 
com.bigdata.bfs.GlobalFileSystemHelper; import com.bigdata.bop.engine.QueryEngine; import com.bigdata.bop.fed.QueryEngineFactory; import com.bigdata.btree.AbstractBTree; import com.bigdata.btree.BTree; import com.bigdata.btree.BTreeCounters; import com.bigdata.btree.BaseIndexStats; import com.bigdata.btree.ILocalBTreeView; import com.bigdata.btree.IndexMetadata; import com.bigdata.btree.IndexSegment; import com.bigdata.btree.ReadCommittedView; import com.bigdata.cache.HardReferenceQueue; import com.bigdata.concurrent.AccessSemaphore; import com.bigdata.concurrent.AccessSemaphore.Access; import com.bigdata.config.IntegerValidator; import com.bigdata.config.LongValidator; import com.bigdata.counters.AbstractStatisticsCollector; import com.bigdata.counters.CounterSet; import com.bigdata.counters.httpd.CounterSetHTTPD; import com.bigdata.ha.HAGlue; import com.bigdata.ha.HAStatusEnum; import com.bigdata.ha.HATXSGlue; import com.bigdata.ha.QuorumService; import com.bigdata.ha.msg.HAGatherReleaseTimeRequest; import com.bigdata.ha.msg.HANotifyReleaseTimeRequest; import com.bigdata.ha.msg.HANotifyReleaseTimeResponse; import com.bigdata.ha.msg.IHAGatherReleaseTimeRequest; import com.bigdata.ha.msg.IHANotifyReleaseTimeRequest; import com.bigdata.ha.msg.IHANotifyReleaseTimeResponse; import com.bigdata.journal.JournalTransactionService.ValidateWriteSetTask; import com.bigdata.quorum.Quorum; import com.bigdata.quorum.QuorumException; import com.bigdata.rawstore.IRawStore; import com.bigdata.rdf.task.IApiTask; import com.bigdata.relation.locator.DefaultResourceLocator; import com.bigdata.relation.locator.ILocatableResource; import com.bigdata.relation.locator.IResourceLocator; import com.bigdata.resources.IndexManager; import com.bigdata.resources.ResourceManager; import com.bigdata.resources.StaleLocatorReason; import com.bigdata.rwstore.IHistoryManager; import com.bigdata.rwstore.IRawTx; import com.bigdata.rwstore.RWStore; import com.bigdata.service.AbstractTransactionService; 
import com.bigdata.service.DataService; import com.bigdata.service.IBigdataFederation; import com.bigdata.sparse.GlobalRowStoreHelper; import com.bigdata.sparse.SparseRowStore; import com.bigdata.util.DaemonThreadFactory; import com.bigdata.util.InnerCause; import com.bigdata.util.concurrent.LatchedExecutor; import com.bigdata.util.concurrent.ShutdownHelper; import com.bigdata.util.concurrent.ThreadPoolExecutorBaseStatisticsTask; /** * Concrete implementation suitable for a local and unpartitioned database. * <p> * Note: This implementation does NOT not support partitioned indices. Because * all data must reside on a single journal resource there is no point to a * view. Views are designed to have data on a mixture of the live journal, one * or more historical journals, and one or more {@link IndexSegment}s. * * @see ResourceManager, which supports views. */ public class Journal extends AbstractJournal implements IConcurrencyManager, /*ILocalTransactionManager,*/ IResourceManager { /** * Logger. */ static final Logger log = Logger.getLogger(Journal.class); /** * @see http://sourceforge.net/apps/trac/bigdata/ticket/443 (Logger for * RWStore transaction service and recycler) */ private static final Logger txLog = Logger.getLogger("com.bigdata.txLog"); /** * Object used to manage local transactions. */ private final AbstractLocalTransactionManager localTransactionManager; /** * Object used to manage tasks executing against named indices. */ private final ConcurrencyManager concurrencyManager; /** * <code>true</code> iff the journal has been configured to use group commit. */ private final boolean isGroupCommit; /** * Options understood by the {@link Journal}. 
 *
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
 */
public interface Options extends com.bigdata.journal.Options,
        com.bigdata.journal.ConcurrencyManager.Options,
        com.bigdata.journal.TemporaryStoreFactory.Options,
        com.bigdata.journal.QueueStatsPlugIn.Options,
        com.bigdata.journal.PlatformStatsPlugIn.Options,
        com.bigdata.journal.HttpPlugin.Options
        // Note: Do not import. Forces bigdata-ganglia dependency.
        // com.bigdata.journal.GangliaPlugIn.Options
{

    /**
     * The name of a boolean option that conditionally enables group commit
     * semantics for the Journal based on task-oriented concurrent writers.
     * This option indicates a <em>contract</em> that the application agrees
     * to respect. When <code>false</code> the application controls when the
     * database goes through a commit point by invoking
     * {@link Journal#commit()}. When <code>true</code> ALL mutation
     * operations MUST be submitted to the {@link IConcurrencyManager} of the
     * journal. Note that the journal always has supported group commit - this
     * is how it is used in scale-out. However, embedded applications have
     * historically made the decision about when the database should commit.
     * When you specify this option, you are asserting that your code will
     * always submit tasks for evaluation using the
     * {@link IConcurrencyManager}.
     * <p>
     * There are several benefits of group commit.
     * <p>
     * First, you can have multiple tenants in the same database instance and
     * the updates for one tenant will no longer block the updates for the
     * other tenants. Thus, one tenant can be safely running a long running
     * update and other tenants can still enjoy low latency updates.
     * <p>
     * Second, group commit automatically combines a sequence of updates on
     * one (or more) tenant(s) into a single commit point on the disk. This
     * provides higher potential throughput. It also means that it is no
     * longer as important for applications to batch their updates since group
     * commit will automatically perform some batching.
     * <p>
     * Third, writes on independent indices may be independent (this behavior
     * has been around for a long time).
     * <p>
     * There are a few "gotchas" with the group commit support. This is
     * because commits are decided by {@link IApiTask} or {@link AbstractTask}
     * completion and tasks are scheduled by the concurrency manager, lock
     * manager, and write executor service.
     * <ul>
     * <li>Mutation tasks that do not complete normally MUST throw an
     * exception!</li>
     * <li>Applications MUST NOT call Journal.commit(). Instead, they submit
     * an IApiTask using AbstractApiTask.submit(). The database will meld the
     * write set of the task into a group commit sometime after the task
     * completes successfully.</li>
     * </ul>
     *
     * @see #566 (NSS GROUP COMMIT).
     */
    String GROUP_COMMIT = Journal.class.getName() + ".groupCommit";

    // Group commit is opt-in: embedded applications historically control commits.
    String DEFAULT_GROUP_COMMIT = "false";

    /**
     * The capacity of the {@link HardReferenceQueue} backing the
     * {@link IResourceLocator} maintained by the {@link Journal}. The
     * capacity of this cache indirectly controls how many
     * {@link ILocatableResource}s the {@link Journal} will hold open.
     * <p>
     * The effect of this parameter is indirect owing to the semantics of
     * weak references and the control of the JVM over when they are
     * cleared. Once an {@link ILocatableResource} becomes weakly reachable,
     * the JVM will eventually GC the object. Since objects which are
     * strongly reachable are never cleared, this provides our guarantee
     * that resources are never closed if they are in use.
     *
     * @see #DEFAULT_LOCATOR_CACHE_CAPACITY
     */
    String LOCATOR_CACHE_CAPACITY = Journal.class.getName()
            + ".locatorCacheCapacity";

    String DEFAULT_LOCATOR_CACHE_CAPACITY = "20";

    /**
     * The timeout in milliseconds for stale entries in the
     * {@link IResourceLocator} cache -or- ZERO (0) to disable the timeout
     * (default {@value #DEFAULT_LOCATOR_CACHE_TIMEOUT}). When this timeout
     * expires, the reference for the entry in the backing
     * {@link HardReferenceQueue} will be cleared. Note that the entry will
     * remain in the {@link IResourceLocator} cache regardless as long as it
     * is strongly reachable.
     */
    String LOCATOR_CACHE_TIMEOUT = Journal.class.getName()
            + ".locatorCacheTimeout";

    // Default is one minute, expressed in milliseconds.
    String DEFAULT_LOCATOR_CACHE_TIMEOUT = "" + (60 * 1000);

    /**
     * The #of threads that will be used to read on the local disk.
     * ZERO (0) disables the dedicated read pool.
     *
     * @see Journal#getReadExecutor()
     */
    String READ_POOL_SIZE = Journal.class.getName() + ".readPoolSize";

    String DEFAULT_READ_POOL_SIZE = "0";

}

/**
 * Create or re-open a journal.
 *
 * @param properties
 *            See {@link com.bigdata.journal.Options}.
 */
public Journal(final Properties properties) {

    this(properties, null/* quorum */);

}

/**
 * Create or re-open a journal, optionally joined to an HA quorum.
 *
 * @param properties
 *            See {@link com.bigdata.journal.Options}.
 * @param quorum
 *            The HA quorum -or- <code>null</code> for a standalone journal.
 */
public Journal(final Properties properties,
        final Quorum<HAGlue, QuorumService<HAGlue>> quorum) {

    super(properties, quorum);

    // Whether the application contracted to use group commit (see Options.GROUP_COMMIT).
    isGroupCommit = Boolean.parseBoolean(properties.getProperty(
            Options.GROUP_COMMIT, Options.DEFAULT_GROUP_COMMIT));

    tempStoreFactory = new TemporaryStoreFactory(properties);

    // General purpose thread pool (daemon threads so the JVM can exit).
    executorService = (ThreadPoolExecutor) Executors
            .newCachedThreadPool(new DaemonThreadFactory(getClass()
                    .getName() + ".executorService"));

    // Note: Historically created only when COLLECT_QUEUE_STATISTICS was
    // enabled; it is now created unconditionally (it is also used by the
    // HA gather protocol to monitor the quorum state).
    scheduledExecutorService = Executors
            .newSingleThreadScheduledExecutor(new DaemonThreadFactory(
                    getClass().getName() + ".sampleService"));

    {
        final int readPoolSize = Integer.valueOf(properties.getProperty(
                Options.READ_POOL_SIZE, Options.DEFAULT_READ_POOL_SIZE));

        if (readPoolSize > 0) {

            // Bounded concurrency layered over the shared executor.
            readService = new LatchedExecutor(executorService,
                    readPoolSize);

        } else {

            // Read pool disabled by configuration.
            readService = null;

        }

    }

    resourceLocator = newResourceLocator();

    resourceLockManager = new ResourceLockService();

    localTransactionManager = newLocalTransactionManager();

    concurrencyManager = new ConcurrencyManager(properties,
            localTransactionManager, this);

    getExecutorService().execute(new StartDeferredTasksTask());

    if (isGroupCommit() && !(this.isHAJournal())
            && getRootBlockView().getCommitCounter() == 0L) {

        /*
         * GROUP_COMMIT: See #566.
         *
         * This hacks in an initial commit to force the GRS to spring into
         * existence. Maybe we should just do this for the initial create of
         * the Journal when using group commit? Or maybe we should make the
         * AbstractTask smart enough to merge two versions of the GRS - it
         * should be simple enough to provide write-write conflict
         * resolution for the GRS.
         *
         * Note: This needs to be done by the leader in HA, hence the
         * conditional test above.
         */

        getGlobalRowStore(); // Force the GRS index to exist.

        commit(); // Commit.

    }

}

/**
 * Ensure that the WORM mode of the journal always uses
 * {@link Long#MAX_VALUE} for
 * {@link AbstractTransactionService.Options#MIN_RELEASE_AGE}.
 *
 * @param properties
 *            The properties.
 *
 * @return The argument, with the minReleaseAge overridden if necessary.
 *
 * @see https://sourceforge.net/apps/trac/bigdata/ticket/391
 */
private Properties checkProperties(final Properties properties) {

    if (getBufferStrategy() instanceof WORMStrategy) {

        // WORM never recycles storage, so the release age is unbounded.
        properties.setProperty(
                AbstractTransactionService.Options.MIN_RELEASE_AGE, ""
                        + Long.MAX_VALUE);

    }

    return properties;

}

/**
 * Factory for the {@link IResourceLocator} for the {@link Journal}.
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
protected IResourceLocator<?> newResourceLocator() {

    final int cacheCapacity = getProperty(Options.LOCATOR_CACHE_CAPACITY,
            Options.DEFAULT_LOCATOR_CACHE_CAPACITY,
            IntegerValidator.GT_ZERO);

    final long cacheTimeout = getProperty(Options.LOCATOR_CACHE_TIMEOUT,
            Options.DEFAULT_LOCATOR_CACHE_TIMEOUT, LongValidator.GTE_ZERO);

    return new DefaultResourceLocator(this, null/* delegate */,
            cacheCapacity, cacheTimeout);

}

/**
 * Inner class used to coordinate the distributed protocol for achieving an
 * atomic consensus on the new <i>releaseTime</i> for the services joined
 * with a met quorum.
 *
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan
 *         Thompson</a>
 */
private class BarrierState implements Runnable {

    /**
     * The token that must remain valid.
     */
    final private long token;

    /**
     * The commit counter that will be assigned to the commit point. This is
     * used to ensure that the GATHER and PREPARE are for the same commit
     * point and that the follower is at the previous commit point.
     */
    final private long newCommitCounter;

    /**
     * The commit time that will be assigned to the commit point. This is
     * used to ensure that the GATHER and PREPARE are for the same commit
     * point and that the follower is at the previous commit point.
     */
    final private long newCommitTime;

    /**
     * Local HA service implementation (non-Remote).
     */
    final private QuorumService<HAGlue> quorumService;

    /** The services joined with the met quorum, in their join order. */
    final private UUID[] joinedServiceIds;

    /**
     * {@link CyclicBarrier} used to coordinate the protocol for achieving an
     * atomic consensus on the new <i>releaseTime</i> for the services
     * joined with a met quorum.
     * <p>
     * Note: The {@link #barrier} provides visibility for the fields that are
     * modified by {@link #run()} so we do not need additional locks or
     * atomics for synchronizing these state updates.
     */
    final private CyclicBarrier barrier;

    /**
     * A timestamp taken on the leader when we start the protocol to
     * discover the new releaseTime consensus.
     */
    final private long timestampOnLeader;

    /**
     * The {@link UUID} of the quorum leader.
     */
    final private UUID leaderId;

    /**
     * This is the earliest visible commit point on the leader.
     */
    final private IHANotifyReleaseTimeRequest leadersValue;

    /**
     * Exception is set by {@link #run()} if there is a problem when the
     * barrier breaks. The exception is then thrown out to the thread on
     * the leader that is running commitNow(), forcing the commit to fail.
     */
    volatile Throwable cause = null;

    /**
     * The message from each of those followers providing their local
     * earliest visible commit point.
     * <p>
     * Note: The {@link ConcurrentHashMap} does NOT allow <code>null</code>
     * values. Further the {@link IHANotifyReleaseTimeRequest} specifies the
     * serviceId of the follower. Therefore, a follower whose
     * {@link GatherTask} fails MUST provide a "mock"
     * {@link IHANotifyReleaseTimeRequest} that it will use to wait at the
     * {@link CyclicBarrier}.
     *
     * @see InnerJournalTransactionService#notifyEarliestCommitTime(IHANotifyReleaseTimeRequest)
     * @see GatherTask
     */
    final private Map<UUID, IHANotifyReleaseTimeRequest> followerResponses = new ConcurrentHashMap<UUID, IHANotifyReleaseTimeRequest>();

    /**
     * The value from {@link #followerResponses} associated with the earliest
     * commit point. This is the basis for the "consensus" across the
     * services.
     */
    private IHANotifyReleaseTimeRequest minimumResponse = null;

    /**
     * The consensus value. This is a restatement of the data in from the
     * {@link #minimumResponse}. This is set by {@link #run()}.
     */
    protected volatile IHANotifyReleaseTimeResponse consensus = null;

    /** Resolve the (possibly remote) HA glue for the given service. */
    private HATXSGlue getService(final UUID serviceId) {

        return quorumService.getService(serviceId);

    }

    // Note: A commented-out per-follower remote Future map and a
    // cancelRemoteFutures() helper were removed here; remote futures with
    // DGC leaked native threads (see ticket #673) and were replaced by the
    // local-futures approach in messageFollowers().

    /**
     * @param newCommitCounter
     *            The commit counter that will be assigned to the new
     *            commit point.
     * @param newCommitTime
     *            The commit time that will be assigned to the new commit
     *            point.
     * @param joinedServiceIds
     *            The services joined with the met quorum, in their join
     *            order.
     */
    public BarrierState(final long newCommitCounter,
            final long newCommitTime, final UUID[] joinedServiceIds) {

        token = getQuorum().token();

        this.newCommitCounter = newCommitCounter;

        this.newCommitTime = newCommitTime;

        // Fail fast unless this service is the leader for that token.
        getQuorum().assertLeader(token);

        // Local HA service implementation (non-Remote).
        quorumService = getQuorum().getClient();

        this.joinedServiceIds = joinedServiceIds;

        this.leaderId = quorumService.getServiceId();

        leadersValue = ((InnerJournalTransactionService) getTransactionService())
                .newHANotifyReleaseTimeRequest(leaderId, newCommitCounter,
                        newCommitTime);

        // Note: Local method call.
        timestampOnLeader = leadersValue.getTimestamp();

        // All joined services (leader and followers) are parties to the barrier.
        final int nparties = joinedServiceIds.length;// - 1;

        // [this] is the barrier action, executed when the barrier trips.
        barrier = new CyclicBarrier(nparties, this);

    }

    /**
     * Find the minimum value across the responses when the {@link #barrier}
     * breaks (this is the barrier action for the {@link #barrier}).
     */
    @Override
    public void run() {

        try {

            if (log.isInfoEnabled())
                log.info("leader: " + leadersValue);

            // This is the timestamp from the BarrierState ctor.
            final long timeLeader = leadersValue.getTimestamp();

            // This is the timestamp for right now.
            final long timeNow = newConsensusProtocolTimestamp();

            // The local clock must be moving forward.
            // assertBefore(timeLeader, timeNow);

            // Start with the leader's value (from ctor).
            minimumResponse = leadersValue;

            for (IHANotifyReleaseTimeRequest response : followerResponses
                    .values()) {

                if (log.isTraceEnabled())
                    log.trace("follower: " + response);

                if (response.isMock()) {

                    /**
                     * The mock response should not have any influence on
                     * the consensus release time. The follower provides the
                     * mock response when it is unable to execute the
                     * GatherTask correctly, typically because it is not yet
                     * HAReady. The mock response preserves liveness for the
                     * GATHER protocol. The follower that provided the mock
                     * response will vote NO for the PREPARE because its
                     * GatherTask will have thrown out an exception.
                     *
                     * @see <a href=
                     *      "https://sourceforge.net/apps/trac/bigdata/ticket/720"
                     *      > HA3 simultaneous service start failure </a>
                     */
                    log.warn("Ignoring mock response: " + response);

                    continue;

                }

                final UUID followerId = response.getServiceUUID();

                // Track the response pinning the earliest commit point.
                if (minimumResponse.getPinnedCommitCounter() > response
                        .getPinnedCommitCounter()) {

                    minimumResponse = response;

                }

                /*
                 * Verify that the timestamp from the ctor is BEFORE the
                 * timestamp assigned by the follower in the GatherTask.
                 */
                assertBefore(leaderId, followerId, timeLeader,
                        response.getTimestamp());

                /*
                 * Verify that the timestamp from the GatherTask on the
                 * follower is before the timestamp obtained at the top of
                 * this run() method.
                 */
                assertBefore(followerId, leaderId, response.getTimestamp(),
                        timeNow);

            }

            // Restate the consensus as an appropriate message object.
            consensus = new HANotifyReleaseTimeResponse(
                    minimumResponse.getPinnedCommitTime(),
                    minimumResponse.getPinnedCommitCounter());

            if (log.isInfoEnabled())
                log.info("consensus: " + consensus);

        } catch (Throwable t) {

            // Set the cause. (Checked by the thread running commitNow().)
            cause = t;

        }

    }

    /**
     * Task does an RMI to the follower to start the GatherTask on the
     * follower.
     */
    private final class StartGatherOnFollowerTask implements Callable<Void> {

        private final UUID serviceId;

        private final IHAGatherReleaseTimeRequest msg;

        public StartGatherOnFollowerTask(final UUID serviceId,
                final IHAGatherReleaseTimeRequest msg) {
            this.serviceId = serviceId;
            this.msg = msg;
        }

        public Void call() throws Exception {

            // Resolve joined service.
            final HATXSGlue service = getService(serviceId);

            // Message remote service.
            // Note: NPE if [service] is gone.
            service.gatherMinimumVisibleCommitTime(msg);

            // Done.
            return null;

        }

    } // class StartGatherOnFollowerTask

    /**
     * Send an {@link IHAGatherReleaseTimeRequest} message to each follower.
     * Block until the responses are received.
     * <p>
     * Note: Like the 2-phase commit, the overall protocol should succeed if
     * we can get <code>((k+1)/2)</code> services that do not fail this
     * step.
* Thus for HA3, we should allow one error on a follower; the
* leader is sending the messages and is presumed to succeed, and one
* follower COULD fail without failing the protocol. If the protocol
* does fail we have to fail the commit, so getting this right is
* NECESSARY. At a minimum, we must not fail if all joined services on
* entry to this method respond without failing (that is, succeed if no
* services fail during this protocol) - this is implemented.
*
* @throws InterruptedException
* @throws BrokenBarrierException
* @throws TimeoutException
*
* @see <a href="https://sourceforge.net/apps/trac/bigdata/ticket/673" >
*      Native thread leak in HAJournalServer process </a>
*/
private void messageFollowers(final long token, final long timeoutNanos)
        throws IOException, InterruptedException, BrokenBarrierException,
        TimeoutException {

    final long begin = System.nanoTime();
    final long nanos = timeoutNanos;
    long remaining = nanos;

    // Verify that this service is still the leader.
    getQuorum().assertLeader(token);

    /*
     * Note: An earlier design used asynchronous remote Futures for the
     * GatherTask on each follower. DGC for those remote futures caused a
     * native thread leak on the followers, so this method now relies on
     * LOCAL futures for the RMI requests only. See ticket #673. (The dead
     * commented-out remote-futures bookkeeping was removed here.)
     */

    // Local future for RMI requesting GATHER on each follower.
    final List<Future<Void>> futures = new LinkedList<Future<Void>>();

    try {

        final IHAGatherReleaseTimeRequest msg = new HAGatherReleaseTimeRequest(
                token, timestampOnLeader, leaderId, newCommitCounter,
                newCommitTime);

        // Do not send message to self (leader is at index 0).
        for (int i = 1; i < joinedServiceIds.length; i++) {

            final UUID serviceId = joinedServiceIds[i];

            /*
             * Message each follower.
             *
             * Note: The invoked RMI method submits the GatherTask that
             * executes on the follower and returns. It does not block
             * waiting for the outcome of the task on the follower.
             * Instead, we wait until the barrier breaks. A thread will
             * monitor the quorum state and break the barrier if the
             * quorum breaks or if a joined service leaves during the
             * consensus protocol.
             *
             * Note: This uses multiple threads to issue the requests in
             * parallel against the followers in order to minimize the
             * latency of the protocol.
             */

            // Note: throws RejectedExecutionException if shutdown.
            futures.add(getExecutorService().submit(
                    new StartGatherOnFollowerTask(serviceId, msg)));

        }

        /*
         * This sets up a task that will monitor the quorum state and
         * then interrupt this Thread if it is blocked at the barrier
         * [actually, it uses barrier.reset(), which appears to be a
         * little safer].
         *
         * If this service is no longer the quorum leader or if any of
         * the services leave that were joined with the met quorum when
         * we started the release time consensus protocol, then we have
         * to reset() the barrier. We achieve this by interrupting the
         * Thread (actually it now uses barrier.reset()).
         *
         * Note: CyclicBarrier.await(timeout,unit) causes the barrier to
         * break if the timeout is exceeded (as opposed to simply
         * throwing the TimeoutException and allowing the thread to
         * retry the CyclicBarrier.await()). Therefore it CAN NOT be
         * used in preference to this pattern. However, we could replace
         * the use of the CyclicBarrier with a Phaser (JDK 1.7 or
         * jr166).
         */
        {

            final Quorum<HAGlue, QuorumService<HAGlue>> quorum = getQuorum();

            final long initialDelay = 100; // milliseconds.
            final long delay = initialDelay;

            final ScheduledFuture<?> scheduledFuture = scheduledExecutorService
                    .scheduleWithFixedDelay(new Runnable() {
                        public void run() {
                            try {

                                // Verify service is still leader.
                                quorum.assertLeader(token);

                                // Verify service self-recognizes as leader.
                                if (getHAStatus() != HAStatusEnum.Leader) {

                                    throw new QuorumException();

                                }

                                // Verify messaged services still joined.
                                assertServicesStillJoined(quorum);

                                for (Future<Void> f : futures) {

                                    if (f.isDone()) {

                                        /*
                                         * Note: If any follower fails on
                                         * the RMI, then that is noticed
                                         * here and the GATHER will fail on
                                         * the leader.
                                         *
                                         * TODO This should be robust as
                                         * long as a majority of the
                                         * services succeed. Right now this
                                         * will stop the GATHER if any
                                         * service fails on the RMI.
                                         */
                                        f.get();

                                    }

                                }

                            } catch (Throwable ex) {

                                if (InnerCause.isInnerCause(ex,
                                        InterruptedException.class)) {

                                    // Normal termination.
                                    return;

                                }

                                // Invariant violated: break the barrier.
                                logErrorAndResetBarrier(ex);

                            }
                        }
                    }, initialDelay, delay, TimeUnit.MILLISECONDS);

            // Update time remaining.
            remaining = nanos - (System.nanoTime() - begin);

            try {

                /*
                 * Throws InterruptedException, BrokenBarrierException,
                 * TimeoutException.
                 *
                 * Note: If TimeoutException is thrown, then the barrier
                 * will be broken.
                 */
                barrier.await(remaining, TimeUnit.NANOSECONDS);

            } finally {

                // Stop the quorum-state monitor.
                scheduledFuture.cancel(true/* mayInterruptIfRunning */);

            }

        }

    } finally {

        /*
         * Cancel local futures for RMI messages to followers.
         *
         * Note: Regardless of outcome or errors above, ensure that the
         * futures used to initiate the GatherTask on the followers are
         * cancelled. These are local Futures that do RMIs. The RMIs
         * should not block when they execute on the follower.
         */
        for (Future<Void> f : futures) {

            /*
             * Await the Future with a short timeout for the task that
             * did the RMI to each follower to participate in the GATHER
             * protocol. Ensure that these Futures are cancelled if they
             * are not already done, but prefer not to attempt to cancel
             * the Future if it would be finished if we waited for a few
             * milliseconds.
             */
            boolean done = false;
            try {
                f.get(10, TimeUnit.MILLISECONDS);
                done = true;
            } catch (Exception ex) {
                // IGNORE
            } finally {
                if (!done) {
                    // cancel local future.
                    f.cancel(true/* mayInterruptIfRunning */);
                }
            }

            /*
             * Future is done. Either cancelled, error, or successfully
             * completed per the code block immediately above.
             */
            try {
                // check outcome of future.
                f.get();
            } catch (CancellationException e) {
                /*
                 * There is a race between the code path returning from
                 * the RMI (to request the caller to participate in the
                 * gather protocol) and the code path in which the
                 * leader awakens from the broken barrier (above). We
                 * have explicitly cancelled the future for that RMI and
                 * then checked the Future. If the leader wakes up
                 * before the RMI returns, then the Future will be
                 * cancelled rather than done. This case is a race, not
                 * an error.
                 */
                if (log.isInfoEnabled())
                    log.info(e);// , e);
            } catch (ExecutionException e) {
                // Probably error on the RMI.
                log.error(e, e);
            }

        }

        if (consensus == null) {

            /*
             * If there were any followers that did not message the
             * leader and cause the barrier to be decremented and hence
             * the [consensus] to become defined, then we need to
             * decrement the barrier for those followers now in order
             * for it to break.
             *
             * There is no method to decrement by a specific number
             * (unlike a semaphore), but you can reset() the barrier,
             * which will cause a BrokenBarrierException for all Threads
             * waiting on the barrier.
             *
             * Note: It appears that [barrier.isBroken()] always reports
             * [false] here. Hence, the test was changed to
             * [consensus==null]. See <a href=
             * "https://sourceforge.net/apps/trac/bigdata/ticket/720" >
             * HA3 simultaneous service start failure </a>
             *
             * FIXME HA TXS: A reset() here does not allow us to proceed
             * with the consensus protocol unless all services
             * "vote yes". Thus, a single node failure during the
             * release time consensus protocol will cause the commit to
             * fail. [Actually, we could use getNumberWaiting(). If it
             * is a bare majority, then we could take the barrier break
             * action ourselves. E.g., in the thread that calls
             * barrier.reset()]. [Actually, this might not be a problem
             * for cases where the GatherTask is able to send back a
             * mock IHANotifyReleaseTimeRequest message, only when we
             * are interrupted by the Runnable above that is monitoring
             * the quorum state for an invariant change.]
             */

            log.error("Forcing barrier break");

            barrier.reset();

        }

    }// finally

}

// Note: A large commented-out spinWaitBarrier() alternative (looping
// barrier.await() with quorum re-checks) was removed here; see ticket
// #673 for the discussion that led to the monitor-task pattern above.

/**
 * Log the error and force the {@link #barrier} to break (unless it is
 * already broken) so that threads blocked at the barrier wake up.
 */
private void logErrorAndResetBarrier(final Throwable ex) {

    log.error(ex, ex);

    if (!barrier.isBroken()) {

        log.error("Forcing barrier break");

        barrier.reset();

    }

}

/**
 * Verify that the services that were messaged for the release time
 * consensus protocol are still joined with the met quorum.
 *
 * @throws QuorumException
 *             if one of the joined services leaves.
 */
private void assertServicesStillJoined(
        final Quorum<HAGlue, QuorumService<HAGlue>> quorum)
        throws QuorumException {

    // Snapshot of the currently joined services.
    final UUID[] tmp = quorum.getJoined();

    for (UUID serviceId : joinedServiceIds) {

        boolean found = false;
        for (UUID t : tmp) {

            if (serviceId.equals(t)) {

                found = true;

                break;

            }

        }

        if (!found) {

            throw new QuorumException(
                    "Service leave during consensus protocol: "
                            + serviceId);

        }

    }

}

} // class BarrierState

/**
 * Note: This deliberately uses the (non-remote) method
 * {@link BasicHA#nextTimestamp()}. This is done so we can write a unit test
 * of the {@link GatherTask} that imposes clock skew by overriding the next
 * value to be returned by that method.
 */
private long newConsensusProtocolTimestamp() {

    return ((BasicHA) getQuorum().getClient().getService()).nextTimestamp();

}

/**
 * {@inheritDoc}
 * <p>
 * Extends the {@link JournalTransactionService} to provide protection for
 * the session protection mode of the {@link RWStore} and to support the
 * {@link HATXSGlue} interface.
     *
     * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan
     *         Thompson</a>
     *
     * @see <a href=
     *      "https://docs.google.com/document/d/14FO2yJFv_7uc5N0tvYboU-H6XbLEFpvu-G8RhAzvxrk/edit?pli=1#"
     *      > HA TXS Design Document </a>
     *
     * @see <a href="https://sourceforge.net/apps/trac/bigdata/ticket/623" > HA
     *      TXS / TXS Bottleneck </a>
     */
    private class InnerJournalTransactionService extends
            JournalTransactionService {

        protected InnerJournalTransactionService() {

            super(checkProperties(properties), Journal.this);

            final long lastCommitTime = Journal.this.getLastCommitTime();

            if (lastCommitTime != 0L) {

                /*
                 * Notify the transaction service on startup so it can set the
                 * effective release time based on the last commit time for the
                 * store.
                 *
                 * Note: For HA, the releaseTime is updated by the consensus
                 * protocol once a quorum is met. Before the quorum meets (and
                 * before a service joins with a met quorum) each service will
                 * track its own releaseTime. Therefore, during startup, the
                 * quorum will be null or HAStatusEnum will be NotReady so the
                 * TXS will automatically track the release time until the
                 * service joins with a met quorum.
                 */

                if (log.isInfoEnabled())
                    log.info("Startup: lastCommitTime=" + lastCommitTime);

                updateReleaseTimeForBareCommit(lastCommitTime);

            }

        }

        /**
         * Open (active) low-level transactions on the backing store, keyed by
         * the txId of the {@link TxState}. Entries are created in activateTx()
         * and released in deactivateTx().
         *
         * @see <a href="http://sourceforge.net/apps/trac/bigdata/ticket/445" >
         *      RWStore does not track tx release correctly </a>
         */
        final private ConcurrentHashMap<Long, IRawTx> m_rawTxs = new ConcurrentHashMap<Long, IRawTx>();

        /**
         * This lock is used to ensure that the following actions are MUTEX:
         * <ul>
         * <li>The barrier where we obtain a consensus among the services joined
         * with the met quorum concerning the new release time.</li>
         * <li>A remote service that wishes to join an already met quorum.</li>
         * <li>A new transaction start that would read on a commit point which
         * is LT than the readsOnCommitTime of the earliestActiveTx for this
         * service but GT earliest visible commit point for this service (as
         * determined by the releaseTime on the transaction service).</li>
         * </ul>
         * Any of these actions must contend for the {@link #barrierLock}.
         *
         * @see #updateReleaseTimeConsensus(long, TimeUnit)
         * @see GatherTask#call()
         * @see #newTx(long)
         * @see #runWithBarrierLock(Runnable)
         */
        final private ReentrantLock barrierLock = new ReentrantLock();

//        final private Condition barrierBroke = barrierLock.newCondition();

        /**
         * This is used to coordinate the protocol for achieving an atomic
         * consensus on the new <i>releaseTime</i> for the services joined with
         * a met quorum.
         */
        final private AtomicReference<BarrierState> barrierRef = new AtomicReference<BarrierState>();

        @Override
        public void runWithBarrierLock(final Runnable r) {

            barrierLock.lock();
            try {
                haLog.info("Will run with barrier lock.");
                try {
                    r.run();
                } catch (Throwable t) {
                    /*
                     * Note: An Interrupt here is not really an ERROR. It
                     * could be caused by a change in the RunState of the
                     * HAJournalServer.
                     */
                    haLog.error(t, t);
                } finally {
                    haLog.info("Did run with barrier lock.");
                }
            } finally {
                barrierLock.unlock();
            }

        }

        /**
         * {@inheritDoc}
         * <p>
         * We need to obtain a distributed consensus for the services joined
         * with the met quorum concerning the earliest commit point that is
         * pinned by the combination of the active transactions and the
         * minReleaseAge on the TXS.
         * <p>
         * New transaction starts during this critical section will block (on
         * the leader or the follower) unless they are guaranteed to be
         * allowable, e.g., based on the current minReleaseAge, the new tx would
         * read from the most recent commit point, the new tx would read from a
         * commit point that is already pinned by an active transaction on that
         * node, etc.
         *
         * @throws IOException
         * @throws BrokenBarrierException
         */
        // Note: Executed on the leader.
        @Override
        public IHANotifyReleaseTimeResponse updateReleaseTimeConsensus(
                final long newCommitCounter, final long newCommitTime,
                final UUID[] joinedServiceIds, final long timeout,
                final TimeUnit units) throws IOException, InterruptedException,
                TimeoutException, BrokenBarrierException {

            final long begin = System.nanoTime();
            final long nanos = units.toNanos(timeout);
            long remaining = nanos;

            final long token = getQuorum().token();

            if (haLog.isInfoEnabled())
                haLog.info("GATHER PROTOCOL: commitCounter=" + newCommitCounter
                        + ", token=" + token + ", joinedServiceIds="
                        + Arrays.toString(joinedServiceIds));

            final BarrierState barrierState;

            barrierLock.lock();

            try {

                getQuorum().assertLeader(token);

                // Only one gather may run at a time: barrierRef must be clear.
                if (!barrierRef.compareAndSet(null/* expectedValue */,
                        barrierState = new BarrierState(newCommitCounter,
                                newCommitTime, joinedServiceIds)/* newValue */)) {

                    throw new IllegalStateException();

                }

                try {

                    /*
                     * Message the followers and block until the barrier breaks.
                     */

                    // Update time remaining.
                    remaining = nanos - (System.nanoTime() - begin);

                    barrierState.messageFollowers(token, remaining);

                } finally {

                    // Clear the barrierRef.
                    if (!barrierRef.compareAndSet(barrierState/* expected */,
                            null)) {

                        throw new AssertionError();

                    }

                }

                if (barrierState.cause != null) {

                    /*
                     * If an exception was recorded, re-throw it in the thread
                     * that invoked commitNow().
                     */

                    throw new RuntimeException(barrierState.cause);

                }

                /**
                 * Update the release time on the leader.
                 *
                 * Note: The follower has the new release time, but it is
                 * running asynchronously and might not have updated its release
                 * time locally by the time the leader leaves the consensus
                 * protocol. prepare2Phase() (on the follower) will check the
                 * Future of the GatherTask and block until it is complete. Thus
                 * all services will be in a state where they are known to have
                 * updated their release time (based on the consensus protocol)
                 * before we finish prepare2Phase() and hence before we run
                 * commit2Phase().
                 *
                 * @see <a
                 *      href="https://sourceforge.net/apps/trac/bigdata/ticket/673"
                 *      > Native thread leak in HAJournalServer process </a>
                 */

                final IHANotifyReleaseTimeResponse consensus = barrierState.consensus;

                if (consensus == null) {

                    throw new RuntimeException("No consensus");

                }

                final long consensusValue = consensus.getCommitTime();

                // Release everything before the consensus commit time.
                final long newReleaseTime = Math.max(0L, consensusValue - 1);

                if (log.isInfoEnabled())
                    log.info("Advancing releaseTime on leader: "
                            + newReleaseTime);

                setReleaseTime(newReleaseTime);

                return consensus;

            } finally {

                barrierLock.unlock();

            }

        }

        /**
         * {@inheritDoc}
         * <p>
         * Overridden to notice whether this service is using the consensus
         * protocol to update the releaseTime or updating it automatically as
         * transactions complete.
         *
         * @see <a href=
         *      "https://sourceforge.net/apps/trac/bigdata/ticket/530#comment:116">
         *      Journal HA </a>
         */
        @Override
        protected boolean isReleaseTimeConsensusProtocol() {

            final HAStatusEnum haStatus = getHAStatus();

            if (haStatus == null || haStatus == HAStatusEnum.NotReady) {

                /*
                 * Since we are not HA or this service is not HAReady, we will
                 * not use the consensus protocol to update the releaseTime.
                 *
                 * Therefore the releaseTime is updated here since we will not
                 * (actually, did not) run the consensus protocol to update it.
                 */

                return false;

            }

            /*
             * Note: When we are using a 2-phase commit, the leader can not
             * update the release time from commit() using this methods. It
             * must rely on the consensus protocol to update the release
             * time instead.
             */

            return true;

        }

//        /**
//         * {@inheritDoc}
//         * <p>
//         * Note: When we are using a 2-phase commit, the leader can not update
//         * the release time from commit() using this methods. It must rely on
//         * the consensus protocol to update the release time instead.
//         *
//         * @see <a href=
//         *      "https://sourceforge.net/apps/trac/bigdata/ticket/530#comment:116">
//         *      Journal HA </a>
//         */
//        @Override
//        protected void updateReleaseTimeForBareCommit(final long commitTime) {
//
//            final HAStatusEnum haStatus = getHAStatus();
//
//            if (haStatus == null || haStatus == HAStatusEnum.NotReady) {
//
//                /*
//                 * Since we are not HA or this service is not HAReady, we will
//                 * not use the consensus protocol to update the releaseTime.
//                 *
//                 * Therefore the releaseTime is updated here since we will not
//                 * (actually, did not) run the consensus protocol to update it.
//                 */
//                super.updateReleaseTimeForBareCommit(commitTime);
//
//            } else {
//
//                /*
//                 * Note: When we are using a 2-phase commit, the leader can not
//                 * update the release time from commit() using this methods. It
//                 * must rely on the consensus protocol to update the release
//                 * time instead.
//                 */
//
//            }
//
//        }

        /**
         * {@inheritDoc}
         * <p>
         * Overridden to take the necessary lock since we are invoking this
         * method from contexts in which the lock would not otherwise be held.
         * <p>
         * Note: This is also used to callback on service join to set the
         * consensus release time from the leader of a newly joined follower.
         * This ensures that live joiners need not take part in a gather and can
         * still confidently join in an HA 2 phase commit.
         *
         * @param newValue
         *            The new release time for the local journal.
         */
        @Override
        public void setReleaseTime(final long newValue) {

            if (newValue < 0)
                throw new IllegalArgumentException();

            // Note: lock is the AbstractTransactionService's lock.
            lock.lock();

            try {

                super.setReleaseTime(newValue);

            } finally {

                lock.unlock();

            }

        }

        /**
         * Factory for {@link IHANotifyReleaseTimeRequest} messages. This is
         * used by both the leader and the followers.
         *
         * @param serviceId
         *            The {@link UUID} for this service.
         *
         * @return The new message.
         */
        protected IHANotifyReleaseTimeRequest newHANotifyReleaseTimeRequest(
                final UUID serviceId, final long newCommitCounter,
                final long newCommitTime) {

            // On AbstractTransactionService.
            final long effectiveReleaseTimeForHA = getEffectiveReleaseTimeForHA();

            // On AbstractJournal
            final ICommitRecord commitRecord = getEarliestVisibleCommitRecordForHA(effectiveReleaseTimeForHA);

            // Zero when there is no visible commit record.
            final long commitCounter = commitRecord == null ? 0 : commitRecord
                    .getCommitCounter();

            final long commitTime = commitRecord == null ? 0 : commitRecord
                    .getTimestamp();

//            final long now = getLocalTransactionManager().nextTimestamp();
            final long now = newConsensusProtocolTimestamp();

            final IHANotifyReleaseTimeRequest req = new HANotifyReleaseTimeRequest(
                    serviceId, commitTime, commitCounter, now,
                    false/* isMock */, newCommitCounter, newCommitTime);

            if (log.isTraceEnabled())
                log.trace("releaseTime=" + getReleaseTime()//
                        + ",effectiveReleaseTimeForHA="
                        + effectiveReleaseTimeForHA //
                        + ",rootBlock=" + getRootBlockView() //
                        + ",req=" + req//
                );

            return req;

        }

        @Override
        public Callable<IHANotifyReleaseTimeResponse> newGatherMinimumVisibleCommitTimeTask(
                final HAGlue leader, final UUID serviceId,
                final IHAGatherReleaseTimeRequest req) {

            return new GatherTask(leader, serviceId, req);

        }

        /**
         * {@inheritDoc}
         * <p>
         * Note: This method is implemented by {@link AbstractJournal.BasicHA}
         * which calls through to
         * {@link #newGatherMinimumVisibleCommitTimeTask(IHAGatherReleaseTimeRequest)}
         *
         * @throws UnsupportedOperationException
         */
        @Override
        public void gatherMinimumVisibleCommitTime(
                final IHAGatherReleaseTimeRequest req) throws IOException {

            throw new UnsupportedOperationException();

        }

        /**
         * "Gather" task runs on the followers.
         * <p>
         * Note: The gather task scopes the consensus protocol on the follower.
         * It contends for the {@link #barrierLock} (on the follower) in order
         * to be MUTEX with new read-only tx starts on the follower which (a)
         * occur during the consensus protocol; and (b) would read on a commit
         * point that is not pinned by any of an active transaction on the
         * follower, the minReleaseAge, or being the most recent commit point.
         * These are the criteria that allow {@link #newTx(long)} to NOT contend
         * for the {@link #barrierLock}.
         *
         * @see #newTx(long)
         */
        private class GatherTask implements
                Callable<IHANotifyReleaseTimeResponse> {

            /**
             * The proxy for the leader (the service that made this request).
             * This is used to RMI back to the leader and therefore MUST be non-
             * <code>null</code>.
             */
            private final HAGlue leader;

            /**
             * The {@link UUID} of <em>this</em> service. This is required as
             * part of the RMI back to the leader (so the leader knows which
             * services responded) and therefore MUST be non-<code>null</code>.
             */
            private final UUID serviceId;

            // The gather request from the leader (never null).
            private final IHAGatherReleaseTimeRequest req;

            /**
             * This variable is set in the try {} block in {@link #call()}. We
             * eventually respond (sending an RMI to the leader) either in the
             * try{} or in the finally{}, depending on whether or not the
             * {@link GatherTask} encounters an error when it executes.
             */
            volatile private boolean didNotifyLeader = false;

            public GatherTask(final HAGlue leader, final UUID serviceId,
                    final IHAGatherReleaseTimeRequest req) {

                if (leader == null)
                    throw new IllegalArgumentException();

                if (serviceId == null)
                    throw new IllegalArgumentException();

                if (req == null)
                    throw new IllegalArgumentException();

                this.leader = leader;

                this.serviceId = serviceId;

                this.req = req;

            }

            /**
             * Note: This needs to be robust to most kinds of errors. However,
             * if the quorum breaks (leader leaves) or if a follower leaves that
             * was joined with the met quorum as of the atomic decision point in
             * commitNow(), then that change will be detected by the leader and
             * it will break the {@link CyclicBarrier}.
             */
            @Override
            public IHANotifyReleaseTimeResponse call() throws Exception {

                if (haLog.isInfoEnabled())
                    haLog.info("Running gather on follower");

                /*
                 * This variable is set in the try {} below. We eventually
                 * respond either in the try{} or in the finally{}, depending on
                 * whether or not the GatherTask encounters an error when it
                 * executes.
                 */
                didNotifyLeader = false;

                try {

                    /*
                     * Test pre-conditions BEFORE getting the barrierLock. This
                     * allows a service that is not yet properly joined to
                     * refuse to do the GATHER before it obtains the barrierLock
                     * that makes the GatherTask MUTEX with
                     * doCastLeadersVoteAndServiceJoin().
                     */
                    preconditionTest();

                    barrierLock.lock(); // take lock on follower!

                    try {

                        // Re-test the pre-conditions.
                        preconditionTest();

                        return doRunWithBarrierLock();

                    } finally {

                        barrierLock.unlock();

                    }

                } catch (Throwable t) {

                    log.error(t, t);

                    if (!didNotifyLeader) {

                        /**
                         * Send mock response to the leader so it does not block
                         * forever waiting for our response. The mock response
                         * MUST include our correct serviceId.
                         *
                         * @see <a href=
                         *      "https://sourceforge.net/apps/trac/bigdata/ticket/720"
                         *      > HA3 simultaneous service start failure </a>
                         */
                        try {

                            final IHANotifyReleaseTimeRequest resp = new HANotifyReleaseTimeRequest(
                                    serviceId, 0L/* pinnedCommitTime */,
                                    0L/* pinnedCommitCounter */,
                                    nextTimestamp()/* timestamp */,
                                    true/* isMock */,
                                    req.getNewCommitCounter(),
                                    req.getNewCommitTime());

                            log.warn("Sending mock response for gather protocol: cause="
                                    + t);

                            // Will block until barrier breaks on leader.
                            leader.notifyEarliestCommitTime(resp);

                        } catch (Throwable t2) {

                            log.error(t2, t2);

                        }

                    }

                    /*
                     * This exception will force PREPARE to fail on this service
                     * when it checks the GatherTask's Future.
                     */
                    throw new Exception(t);

                }

            }

            /**
             * Check various conditions that need to be true.
             * <p>
             * Note: We do this once before we take the barrier lock and once
             * after. We need to do this before we take the barrier lock to
             * avoid a distributed deadlock when a service is attempting to do
             * runWithBarrierLock() to join concurrent with the GATHER of a
             * 2-phase commit. We do it after we take the barrier lock to ensure
             * that the conditions are still satisfied - they are all light
             * weight tests, but the conditions could become invalidated so it
             * does not hurt to check again.
             */
            private void preconditionTest() {

                final long token = req.token();

                /*
                 * we do not need to handle the case where the token is
                 * invalid. The leader will reset() the CyclicBarrier for
                 * this case.
                 */

                // Verify quorum valid for token (implies leader valid)
                getQuorum().assertQuorum(token);

                // Verify this service is HAReady for token.
                assertHAReady(token);

                /*
                 * If the quorumService is null because this service is
                 * shutting down then the leader will notice the
                 * serviceLeave() and reset() the CyclicBarrier.
                 */
                final QuorumService<HAGlue> quorumService = getQuorum()
                        .getClient();

//                /*
//                 * This timestamp is used to help detect clock skew.
//                 */
//                now = newConsensusProtocolTimestamp();

                /*
                 * Note: At this point we have everything we need to form up
                 * our response. If we hit an assertion, we will still
                 * respond in the finally {} block below.
                 */

                /*
                 * Note: This assert has been moved to the leader when it
                 * analyzes the messages from the followers. This allows us
                 * to report out the nature of the exception on the leader
                 * and thence back to the client.
                 */
//                /* Verify event on leader occurs before event on follower.
//                 */
//                assertBefore(req.getTimestampOnLeader(), now);

                // A GATHER request is only valid on a follower.
                if (!quorumService.isFollower(token))
                    throw new QuorumException();

                final long localCommitCounter = getRootBlockView()
                        .getCommitCounter();

                // The gather must be for the *next* commit point locally.
                if (req.getNewCommitCounter() != localCommitCounter + 1) {
                    throw new RuntimeException(
                            "leader is preparing for commitCounter="
                                    + req.getNewCommitCounter()
                                    + ", but follower is at localCommitCounter="
                                    + localCommitCounter);
                }

            }

            /**
             * This code is MUTEX with runWithBarrierLock() in HAJournalServer's
             * doCastLeadersVoteAndJoin().
             */
            private IHANotifyReleaseTimeResponse doRunWithBarrierLock()
                    throws Exception {

                final IHANotifyReleaseTimeRequest req2 = newHANotifyReleaseTimeRequest(
                        serviceId, req.getNewCommitCounter(),
                        req.getNewCommitTime());

                /*
                 * RMI to leader.
                 *
                 * Note: Will block until barrier breaks on the leader.
                 */
                didNotifyLeader = true;

                final IHANotifyReleaseTimeResponse consensusReleaseTime = leader
                        .notifyEarliestCommitTime(req2);

                /*
                 * Now spot check the earliest active tx on this follower. We
                 * want to make sure that this tx is not reading against a
                 * commit point whose state would be released by the new
                 * [consensusReleaseTime] that we just obtained from the leader.
                 *
                 * If everything is Ok, we update the releaseTime on the
                 * follower.
                 */
                lock.lock();

                try {

                    if (log.isInfoEnabled())
                        log.info("Validating consensus releaseTime on follower: consensus="
                                + consensusReleaseTime);

                    // the earliest active tx on this follower.
                    final TxState txState = getEarliestActiveTx();

                    // Consensus for new earliest visible commit time.
                    final long t2 = consensusReleaseTime.getCommitTime();

                    if (txState != null
                            && txState.getReadsOnCommitTime() < t2) {

                        /*
                         * At least one transaction exists on the follower that
                         * is reading on a commit point LT the commit point
                         * which would be released. This is either a failure in
                         * the logic to compute the consensus releaseTime or a
                         * failure to exclude new transaction starts on the
                         * follower while computing the new consensus
                         * releaseTime.
                         */

                        throw new AssertionError(
                                "The releaseTime consensus would release a commit point with active readers"
                                        + ": consensus=" + consensusReleaseTime
                                        + ", earliestActiveTx=" + txState);

                    }

                    // Release everything before the consensus commit time.
                    final long newReleaseTime = Math.max(0L,
                            consensusReleaseTime.getCommitTime() - 1);

                    if (log.isInfoEnabled())
                        log.info("Advancing releaseTime on follower: "
                                + newReleaseTime);

                    // Update the releaseTime on the follower
                    setReleaseTime(newReleaseTime);

                } finally {

                    lock.unlock();

                }

                // Done.
                return consensusReleaseTime;

            } // doRunWithBarrierLock

        } // GatherTask

        /**
         * {@inheritDoc}
         * <p>
         * Note: Message sent by follower (RMI). Method executes on leader.
         * <p>
         * We pass the message through to the {@link BarrierState} object.
         * <p>
         * Note: We MUST NOT contend for the {@link #barrierLock} here. That
         * lock is held by the Thread that invoked
         * {@link #updateReleaseTimeConsensus()}.
         */
        @Override
        public IHANotifyReleaseTimeResponse notifyEarliestCommitTime(
                final IHANotifyReleaseTimeRequest req) throws IOException,
                InterruptedException, BrokenBarrierException {

            /*
             * Note: Do NOT error check [req] until we are in the try{} /
             * finally {} below that will do the CyclicBarrier.await().
             */

            final BarrierState barrierState = barrierRef.get();

            if (barrierState == null) {

                /*
                 * If the BarrierState reference has been cleared then it is not
                 * possible for us to count down at the barrier for this message
                 * (since the CyclicBarrier is gone). Otherwise, we will await()
                 * at the CyclicBarrier regardless of the message.
                 */

                throw new IllegalStateException();

            }

            try {

                if (haLog.isInfoEnabled())
                    haLog.info("resp=" + req);

                getQuorum().assertLeader(barrierState.token);

                if (barrierState.newCommitCounter != req.getNewCommitCounter()) {
                    /*
                     * Response is for the wrong GATHER request.
                     */
                    throw new RuntimeException(
                            "Wrong newCommitCounter: expected="
                                    + barrierState.newCommitCounter
                                    + ", actual=" + req.getNewCommitCounter());
                }

                if (barrierState.newCommitTime != req.getNewCommitTime()) {
                    /*
                     * Response is for the wrong GATHER request.
                     */
                    throw new RuntimeException("Wrong newCommitTime: expected="
                            + barrierState.newCommitTime + ", actual="
                            + req.getNewCommitTime());
                }

                // ServiceId of the follower (NPE if req is null).
                final UUID followerId = req.getServiceUUID();

                // Make a note of the message from this follower.
                barrierState.followerResponses.put(followerId, req);

            } catch (RuntimeException e) {

                /*
                 * Note: The try {} block can throw RuntimeException but not
                 * Exception. If anything is thrown, then reset the barrier and
                 * rethrow the exception.
                 */

                haLog.error(e, e);

                // Reset the barrier (barrier will break).
                barrierState.barrier.reset();

                // Rethrow the exception.
                throw new RuntimeException(e);

            } finally {

                /*
                 * Block until barrier breaks.
                 *
                 * Note: The barrier will break immediately if it was reset in
                 * the catch{} block above.
                 */

                try {

                    if (haLog.isInfoEnabled()) {
                        haLog.info("Awaiting barrier: #followerResponses="
                                + barrierState.followerResponses.size()
                                + ", #parties="
                                + barrierState.barrier.getParties()
                                + ", #joinedUUIDs="
                                + barrierState.joinedServiceIds.length);
                    }

                } finally {

                    /*
                     * Follower blocks on Thread on the leader here.
                     *
                     * Note: This will throw InterruptedException -or-
                     * BarrierBrokenException if the barrier is reset().
                     */
                    barrierState.barrier.await();

                }

            }

            /*
             * Check for an error in the consensus protocol.
             */

            final Throwable t = barrierState.cause;

            if (t != null) {

                /*
                 * Log error.
                 */

                haLog.error(t, t);

                // rethrow cause.
                throw new RuntimeException(t);

            }

            // Return the consensus.
            final IHANotifyReleaseTimeResponse resp = barrierState.consensus;

            if (resp == null) {

                /*
                 * Log error, but return anyway.
                 */

                haLog.error("No consensus");

            }

            return resp;

        }

        /**
         * Helper method returns the {@link HAStatusEnum} -or- <code>null</code>
         * if this is not HA or if the {@link Quorum} is not running. This is a
         * <em>low latency local</em> method call. The code path is against the
         * local (non-remote) HAGlue object. It is NOT an RMI.
         *
         * @return The {@link HAStatusEnum} or <code>null</code>.
         */
        private final HAStatusEnum getHAStatus() {

            // Quorum iff HA.
            final Quorum<HAGlue, QuorumService<HAGlue>> quorum = getQuorum();

            if (quorum == null) {

                // Not HA.
                return null;

            }

            // Note: This is the local service interface.
            final HAGlue localService;
            try {

                localService = quorum.getClient().getService();

            } catch (IllegalStateException ex) {

                /*
                 * Quorum client is not running (not started or terminated).
                 */
                return null;

            }

            // Note: Invocation against local HAGlue object (NOT RMI).
            try {

                return localService.getHAStatus();

            } catch (IOException ex) {

                // Note: Exception is never thrown (not RMI).
                throw new RuntimeException(ex);

            }

        }

        @Override
        public long newTx(final long timestamp) {

            if (TimestampUtility.isReadWriteTx(timestamp)) {

                // The caller has provided a TxId, not a timestamp.
                throw new IllegalArgumentException();

            }

            // The HAStatusEnum -or- null if not HA.
            final HAStatusEnum haStatus = getHAStatus();

            if (haStatus == null) {

                // Not HA.
                return _newTx(timestamp);

            }

            if (haStatus == HAStatusEnum.NotReady) {

                // Not ready.
                throw new QuorumException();

            }

            if (timestamp == ITx.UNISOLATED
                    && haStatus != HAStatusEnum.Leader) {

                // Read/Write Tx starts are only allowed on the Leader.
                throw new QuorumException("Not quorum leader");

            }

            if (timestamp == ITx.UNISOLATED
                    || timestamp == ITx.READ_COMMITTED) {

                /*
                 * A read-write tx reads on the current commit point.
                 *
                 * A read-committed tx reads on the current commit point.
                 *
                 * The current commit point is always visible, so these requests
                 * are non-blocking.
                 *
                 * Note: We have verified that this service is the quorum leader
                 * above if the request is for a read-write tx.
                 */

                return _newTx(timestamp);

            }

            /*
             * The request is a read-only tx against some specific historical
             * commit point. It will be allowed (without blocking at the
             * barrier) if the commit point is known to be pinned based on
             * either the minReleaseAge or the earliestActiveTx. We use the
             * AbstractTransactionService's lock to make these inspections
             * atomic.
             */

            lock.lock(); // Note: AbstractTransactionService.lock

            try {

                final long now = nextTimestamp();

                final long minReleaseAge = getMinReleaseAge();

                final long ageOfTxView = now - timestamp;

                if (ageOfTxView < minReleaseAge) {

                    // Start tx. Commit point pinned by minReleaseAge.
                    return _newTx(timestamp);

                }

                /*
                 * Handle commit point pinned by earliestActiveTx's
                 * readsOnCommitTime.
                 */
                {

                    final TxState state = getEarliestActiveTx();

                    if (state != null
                            && state.getReadsOnCommitTime() <= timestamp) {

                        // Start Tx. Commit point pinned by earliestActiveTx.
                        return _newTx(timestamp);

                    }

                }

                final IRootBlockView rootBlock = getRootBlockView();

                if (rootBlock.getCommitCounter() == 0L) {

                    // Start Tx. No commits so nothing could be released.
                    return _newTx(timestamp);

                }

                if (rootBlock.getLastCommitTime() <= timestamp) {

                    // Tx reads on most recent commit point.
                    return _newTx(timestamp);

                }

            } finally {

                lock.unlock();

            }

            /*
             * Must block at barrier.
             */

            barrierLock.lock();

            try {

                if (log.isInfoEnabled())
                    log.info("NewTx with barrierLock");

                return _newTx(timestamp);

            } finally {

                barrierLock.unlock();

            }

        }

        /**
         * Core impl.
         * <p>
         * This code pre-increments the active transaction count within the
         * RWStore before requesting a new transaction from the transaction
         * service.
         This ensures that the RWStore does not falsely believe
         * that there are no open transactions during the call to
         * AbstractTransactionService#newTx().
         * <p>
         * Note: This code was moved into the inner class extending the
         * {@link JournalTransactionService} in order to ensure that we
         * follow this pre-incremental pattern for an {@link HAJournal} as
         * well.
         *
         * @see <a
         *      href="https://sourceforge.net/apps/trac/bigdata/ticket/440#comment:13">
         *      BTree can not be cast to Name2Addr </a>
         * @see <a
         *      href="https://sourceforge.net/apps/trac/bigdata/ticket/530">
         *      Journal HA </a>
         */
        private final long _newTx(final long timestamp) {

            IRawTx tx = null;

            try {

                final IBufferStrategy bufferStrategy = getBufferStrategy();

                if (bufferStrategy instanceof IHistoryManager) {

                    // pre-increment the active tx count.
                    tx = ((IHistoryManager) bufferStrategy).newTx();

                }

                return super.newTx(timestamp);

            } finally {

                if (tx != null) {

                    /*
                     * If we had pre-incremented the transaction counter in
                     * the RWStore, then we decrement it before leaving this
                     * method.
                     */

                    tx.close();

                }

            }

        }

        @Override
        public long commit(final long tx) {

            final TxState state = getTxState(tx);

            final Quorum<HAGlue, QuorumService<HAGlue>> quorum = getQuorum();

            if (quorum != null && state != null && !state.isReadOnly()) {

                /*
                 * Commit on write transaction. We must be the quorum leader.
                 */

                final long token = getQuorumToken();

                getQuorum().assertLeader(token);

            }

            return super.commit(tx);

        }

        @Override
        protected void activateTx(final TxState state) {

            if (txLog.isInfoEnabled())
                txLog.info("OPEN : txId=" + state.tx
                        + ", readsOnCommitTime=" + state.getReadsOnCommitTime());

            final IBufferStrategy bufferStrategy = Journal.this
                    .getBufferStrategy();

            if (bufferStrategy instanceof IHistoryManager) {

                // Pin store history for the duration of this tx.
                final IRawTx tx = ((IHistoryManager) bufferStrategy).newTx();

                if (m_rawTxs.put(state.tx, tx) != null) {

                    throw new IllegalStateException(
                            "Unexpected existing RawTx");

                }

            }

            super.activateTx(state);

        }

        @Override
        protected void deactivateTx(final TxState state) {

            if (txLog.isInfoEnabled())
                txLog.info("CLOSE: txId=" + state.tx
                        + ", readsOnCommitTime=" + state.getReadsOnCommitTime());

            /*
             * Note: We need to deactivate the tx before RawTx.close() is
             * invoked otherwise the activeTxCount will never be zero inside
             * of RawTx.close() and the session protection mode of the
             * RWStore will never be able to release storage.
             */
            super.deactivateTx(state);

            final IRawTx tx = m_rawTxs.remove(state.tx);

            if (tx != null) {

                tx.close();

            }

        }

        /**
         * Extended to cancel any running or queued tasks on the
         * {@link WriteExecutorService}.
         *
         * @see <a href="http://trac.blazegraph.com/ticket/753" > HA
         *      doLocalAbort() should interrupt NSS requests and AbstractTasks
         *      </a>
         */
        @Override
        public void abortAllTx() {

            super.abortAllTx();

            concurrencyManager.abortAllTx();

        }

    } // class InnerJournalTransactionService

    protected JournalTransactionService newTransactionService() {

        // The embedded transaction service for this Journal.
        final JournalTransactionService abstractTransactionService = new InnerJournalTransactionService();

        return abstractTransactionService;

    }

    protected AbstractLocalTransactionManager newLocalTransactionManager() {

        final JournalTransactionService abstractTransactionService = newTransactionService();

        // Start the embedded transaction service before wrapping it.
        abstractTransactionService.start();

        return new AbstractLocalTransactionManager() {

            @Override
            public AbstractTransactionService getTransactionService() {

                return abstractTransactionService;

            }

            /**
             * Extended to shutdown the embedded transaction service.
             */
            @Override
            public void shutdown() {

                ((JournalTransactionService) getTransactionService())
                        .shutdown();

                super.shutdown();

            }

            /**
             * Extended to shutdown the embedded transaction service.
             */
            @Override
            public void shutdownNow() {

                ((JournalTransactionService) getTransactionService())
                        .shutdownNow();

                super.shutdownNow();

            }

        };

    }

    @Override
    public AbstractLocalTransactionManager getLocalTransactionManager() {

        return localTransactionManager;

    }

    @Override
    public boolean isGroupCommit() {

        return isGroupCommit;

    }

    /**
     * Interface defines and documents the counters and counter namespaces
     * reported by the {@link Journal} and the various services which it uses.
     *
     * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan
     *         Thompson</a>
     */
    public static interface IJournalCounters extends
            ConcurrencyManager.IConcurrencyManagerCounters,
//            ...TransactionManager.XXXCounters,
            ResourceManager.IResourceManagerCounters {

        /**
         * The namespace for the counters pertaining to the
         * {@link ConcurrencyManager}.
*/ String concurrencyManager = "Concurrency Manager"; /** * The namespace for the counters pertaining to the named indices. */ String indexManager = "Index Manager"; /** * The namespace for the counters pertaining to the {@link ILocalTransactionService}. */ String transactionManager = "Transaction Manager"; /** * The namespace for counters pertaining to the * {@link Journal#getExecutorService()}. */ String executorService = "Executor Service"; /** * Performance counters for the query engine associated with this * journal (if any). */ String queryEngine = "Query Engine"; } /** * {@inheritDoc} * <p> * Overridden to attach additional performance counters. */ @Override public CounterSet getCounters() { final CounterSet root = new CounterSet(); // Host wide performance counters (collected from the OS). { final AbstractStatisticsCollector t = getPlatformStatisticsCollector(); if (t != null) { root.attach(t.getCounters()); } } // JVM wide performance counters. { final CounterSet tmp = root.makePath("JVM"); tmp.attach(AbstractStatisticsCollector.getMemoryCounterSet()); } // Journal performance counters. { final CounterSet tmp = root.makePath("Journal"); tmp.attach(super.getCounters()); // Live index counters iff available. { final CounterSet indexCounters = getIndexCounters(); if (indexCounters != null) { tmp.makePath(IJournalCounters.indexManager).attach( indexCounters); } } tmp.makePath(IJournalCounters.concurrencyManager) .attach(concurrencyManager.getCounters()); tmp.makePath(IJournalCounters.transactionManager) .attach(localTransactionManager.getCounters()); { final IPlugIn<Journal, ThreadPoolExecutorBaseStatisticsTask> plugin = pluginQueueStats .get(); if (plugin != null) { final ThreadPoolExecutorBaseStatisticsTask t = plugin .getService(); if (t != null) { tmp.makePath(IJournalCounters.executorService).attach( t.getCounters()); } } } } // Lookup an existing query engine, but do not cause one to be created. 
final QueryEngine queryEngine = QueryEngineFactory.getInstance() .getExistingQueryController(this); if (queryEngine != null) { final CounterSet tmp = root.makePath(IJournalCounters.queryEngine); tmp.attach(queryEngine.getCounters()); } return root; } /* * IResourceManager */ @Override public File getTmpDir() { return tmpDir; } /** * The directory in which the journal's file is located -or- * <code>null</code> if the journal is not backed by a file. */ @Override public File getDataDir() { final File file = getFile(); if (file == null) { return null; } return file.getParentFile(); } /** * Note: This will only succeed if the <i>uuid</i> identifies <i>this</i> * journal. */ public IRawStore openStore(final UUID uuid) { if(uuid == getRootBlockView().getUUID()) { return this; } throw new UnsupportedOperationException(); } /** * Always returns an array containing a single {@link BTree} which is the * {@link BTree} loaded from the commit record whose commit timestamp is * less than or equal to <i>timestamp</i> -or- <code>null</code> if there * are no {@link ICommitRecord}s that satisfy the probe or if the named * index was not registered as of that timestamp. * * @param name * @param timestamp * * @throws UnsupportedOperationException * If the <i>timestamp</i> is {@link ITx#READ_COMMITTED}. You * MUST use {@link #getIndex(String, long)} in order to obtain a * view that has {@link ITx#READ_COMMITTED} semantics. */ public AbstractBTree[] getIndexSources(final String name, final long timestamp) { final BTree btree; if (timestamp == ITx.UNISOLATED) { /* * Unisolated operation on the live index. */ // MAY be null. btree = getIndex(name); } else if (timestamp == ITx.READ_COMMITTED) { /* * BTree does not know how to update its view with intervening * commits. Further, for a variety of reasons including the * synchronization problems that would be imposed, there are no * plans for BTree to be able to provide read-committed semantics. 
* Instead a ReadCommittedView is returned by * getIndex(name,timestamp) when ITx#READ_COMMITTED is requested and * this method is not invoked. */ throw new UnsupportedOperationException("Read-committed view"); // /* // * Read committed operation against the most recent commit point. // * // * Note: This commit record is always defined, but that does not // * mean that any indices have been registered. // */ // // final ICommitRecord commitRecord = getCommitRecord(); // // final long ts = commitRecord.getTimestamp(); // // if (ts == 0L) { // // log.warn("Nothing committed: name="+name+" - read-committed operation."); // // return null; // // } // // // MAY be null. // btree = getIndex(name, commitRecord); // // if (btree != null) { // //// /* //// * Mark the B+Tree as read-only. //// */ //// //// btree.setReadOnly(true); // // assert ((BTree) btree).getLastCommitTime() != 0; //// btree.setLastCommitTime(commitRecord.getTimestamp()); // // } } else { /* * A specified historical index commit point. * * @see <a * href="http://sourceforge.net/apps/trac/bigdata/ticket/546" > Add * cache for access to historical index views on the Journal by name * and commitTime. </a> */ final long ts = Math.abs(timestamp); // final ICommitRecord commitRecord = getCommitRecord(ts); // // if (commitRecord == null) { // // log.warn("No commit record: name=" + name + ", timestamp=" + ts); // // return null; // // } // // // MAY be null // btree = getIndex(name, commitRecord); // MAY be null btree = (BTree) super.getIndex(name, ts); if (btree != null) { // /* // * Mark the B+Tree as read-only. // */ // // btree.setReadOnly(true); assert btree.getLastCommitTime() != 0; // btree.setLastCommitTime(commitRecord.getTimestamp()); } } /* * No such index as of that timestamp. */ if (btree == null) { if (log.isInfoEnabled()) log.info("No such index: name=" + name + ", timestamp=" + timestamp); return null; } return new AbstractBTree[] { btree }; } /** * Always returns <i>this</i>. 
     */
    final public AbstractJournal getLiveJournal() {

        // A standalone Journal is always its own "live" journal.
        return this;

    }

    /**
     * Always returns <i>this</i>. A standalone Journal has a single backing
     * store regardless of the requested timestamp (there is no overflow
     * processing, so no historical journals exist).
     */
    final public AbstractJournal getJournal(final long timestamp) {

        return this;

    }

    /**
     * Compacts the named indices found on this journal as of the most recent
     * commit point, writing their view onto a new Journal. This method MAY be
     * used concurrently with the {@link Journal} but writes after the selected
     * commit point WILL NOT be reflected in the output file. Typical uses are
     * to reduce the space required by the backing store, to improve locality in
     * the backing store, and to make a backup of the most recent commit point.
     * 
     * @param outFile
     *            The file on which the new journal will be created.
     * 
     * @return The {@link Future} on which you must {@link Future#get() wait}
     *         for the {@link CompactTask} to complete. The already open journal
     *         is accessible using {@link Future#get()}. If you are backing up
     *         data, then be sure to shutdown the returned {@link Journal} so
     *         that it can release its resources.
     */
    public Future<Journal> compact(final File outFile) {

        // The commit point is fixed when the task is submitted; concurrent
        // writes after this point are NOT reflected in the output file.
        return executorService.submit(new CompactTask(this, outFile,
                getLastCommitTime()));

    }

    /**
     * Submit a task that will take a snapshot of the journal and return the
     * {@link Future} for that task. The snapshot is taken on a temporary file.
     * Iff the snapshot is successful, the temporary file is renamed to the
     * application determined file. Thus all snapshots are either valid or
     * were not written. A snapshot of an empty journal is not permitted. Also,
     * the backing store MUST implement the {@link IHABufferStrategy}.
     * <p>
     * Note: This method supports application controlled snapshots and is
     * primarily intended for non-HA deployments. HA has an integrated snapshot
     * and transaction log mechanism which is preferred in HA deployments and
     * also provides the ability for an application to take snapshots on demand.
* * @param snapshotFactory * The factory that will provide the name of the file on which the * snapshot will be written. * * @return The {@link Future} for the snapshot. * * @throws UnsupportedOperationException * if the backing store does not implement the * {@link IHABufferStrategy} interface. * * @see <a href="http://trac.bigdata.com/ticket/1172"> Online backup for * Journal </a> * @since 1.5.2 */ public Future<ISnapshotResult> snapshot( final ISnapshotFactory snapshotFactory) { if (!(getBufferStrategy() instanceof IHABufferStrategy)) { throw new UnsupportedOperationException(); } return executorService.submit(new SnapshotTask(this, snapshotFactory)); } @Override public void dropIndex(final String name) { final BTreeCounters btreeCounters = getIndexCounters(name); super.dropIndex(name); if (btreeCounters != null) { // Conditionally remove the counters for the old index. indexCounters.remove(name, btreeCounters); } } /** * {@inheritDoc} * <p> * Note: {@link ITx#READ_COMMITTED} views are given read-committed semantics * using a {@link ReadCommittedView}. This means that they can be cached * since the view will update automatically as commits are made against * the {@link Journal}. * * @see IndexManager#getIndex(String, long) */ @Override public ILocalBTreeView getIndex(final String name, final long timestamp) { if (name == null) { throw new IllegalArgumentException(); } final boolean isReadWriteTx = TimestampUtility.isReadWriteTx(timestamp); // final Tx tx = (Tx) (isReadWriteTx ? getConcurrencyManager() // .getTransactionManager().getTx(timestamp) : null); final Tx tx = (Tx) /*getConcurrencyManager().*/getTransactionManager() .getTx(timestamp); if (isReadWriteTx) { if (tx == null) { log.warn("Unknown transaction: name=" + name + ", tx=" + timestamp); return null; } tx.lock.lock(); try { if (!tx.isActive()) { // typically this means that the transaction has already // prepared. 
log.warn("Transaction not active: name=" + name + ", tx=" + timestamp + ", prepared=" + tx.isPrepared() + ", complete=" + tx.isComplete() + ", aborted=" + tx.isAborted()); return null; } } finally { tx.lock.unlock(); } } if( isReadWriteTx && tx == null ) { /* * Note: This will happen both if you attempt to use a transaction * identified that has not been registered or if you attempt to use * a transaction manager after the transaction has been either * committed or aborted. */ log.warn("No such transaction: name=" + name + ", tx=" + timestamp); return null; } final boolean readOnly = TimestampUtility.isReadOnly(timestamp); // final boolean readOnly = (timestamp < ITx.UNISOLATED) // || (isReadWriteTx && tx.isReadOnly()); final ILocalBTreeView tmp; if (isReadWriteTx) { /* * Isolated operation. * * Note: The backing index is always a historical state of the named * index. * * Note: Tx.getIndex() will pass through the actual commit time of * the ground state against which the transaction is reading (if it * is available, which it is on the local Journal). * * @see <a * href="https://sourceforge.net/apps/trac/bigdata/ticket/266"> * Refactor native long tx id to thin object</a> */ final ILocalBTreeView isolatedIndex = tx.getIndex(name); if (isolatedIndex == null) { log.warn("No such index: name=" + name + ", tx=" + timestamp); return null; } tmp = isolatedIndex; } else { /* * Non-transactional view. */ if (readOnly) { if (timestamp == ITx.READ_COMMITTED) { // read-committed tmp = new ReadCommittedView(this, name); } else { if (tx != null) { /* * read-only transaction * * @see <a href= * "http://sourceforge.net/apps/trac/bigdata/ticket/546" * > Add cache for access to historical index views on * the Journal by name and commitTime. 
</a> */ final AbstractBTree[] sources = getIndexSources(name, tx.getReadsOnCommitTime()); if (sources == null) { log.warn("No such index: name=" + name + ", timestamp=" + timestamp); return null; } assert sources[0].isReadOnly(); tmp = (BTree) sources[0]; } else { // historical read not protected by a transaction final AbstractBTree[] sources = getIndexSources(name, timestamp); if (sources == null) { log.warn("No such index: name=" + name + ", timestamp=" + timestamp); return null; } assert sources[0].isReadOnly(); tmp = (BTree) sources[0]; } } } else { /* * Writable unisolated index. * * Note: This is the "live" mutable index. This index is NOT * thread-safe. A lock manager is used to ensure that at most * one task has access to this index at a time. */ assert timestamp == ITx.UNISOLATED; final AbstractBTree[] sources = getIndexSources(name, ITx.UNISOLATED); if (sources == null) { if (log.isInfoEnabled()) log.info("No such index: name="+name+", timestamp="+timestamp); return null; } assert ! sources[0].isReadOnly(); tmp = (BTree) sources[0]; } } /* * Make sure that it is using the canonical counters for that index. * * Note: AbstractTask also does this for UNISOLATED indices which it * loads by itself as part of providing ACID semantics for add/drop * of indices. */ tmp.getMutableBTree().setBTreeCounters(getIndexCounters(name)); return tmp; } /** * Always returns the {@link BTree} as the sole element of the array since * partitioned indices are not supported. */ @Override public AbstractBTree[] getIndexSources(final String name, final long timestamp, final BTree btree) { return new AbstractBTree[] { btree }; } /** * Create a new transaction on the {@link Journal}. * <p> * Note: This is a convenience method. The implementation of this method is * delegated to the object returned by {@link #getTransactionService()}. 
* * @param timestamp * A positive timestamp for a historical read-only transaction as * of the first commit point LTE the given timestamp, * {@link ITx#READ_COMMITTED} for a historical read-only * transaction as of the most current commit point on the * {@link Journal} as of the moment that the transaction is * created, or {@link ITx#UNISOLATED} for a read-write * transaction. * * @return The transaction identifier. * * @see ITransactionService#newTx(long) */ final public long newTx(final long timestamp) { /* * Note: The RWStore native tx pre-increment logic is now handled by * _newTx() in the inner class that extends JournalTransactionService. */ try { return getTransactionService().newTx(timestamp); } catch (IOException ioe) { /* * Note: IOException is declared for RMI but will not be thrown * since the transaction service is in fact local. */ throw new RuntimeException(ioe); } } /** * Abort a transaction. * <p> * Note: This is a convenience method. The implementation of this method is * delegated to the object returned by {@link #getTransactionService()}. * * @param tx * The transaction identifier. * * @see ITransactionService#abort(long) */ final public void abort(final long tx) { try { /* * Note: TransactionService will make call back to the * localTransactionManager to handle the client side of the * protocol. */ getTransactionService().abort(tx); } catch (IOException e) { /* * Note: IOException is declared for RMI but will not be thrown * since the transaction service is in fact local. */ throw new RuntimeException(e); } } /** * Commit a transaction. * <p> * Note: This is a convenience method. The implementation of this method is * delegated to the object returned by {@link #getTransactionService()}. * * @param tx * The transaction identifier. * * @return The commit time assigned to that transaction. 
* * @see ITransactionService#commit(long) */ final public long commit(final long tx) throws ValidationError { try { /* * Note: TransactionService will make call back to the * localTransactionManager to handle the client side of the * protocol. */ return getTransactionService().commit(tx); } catch (IOException e) { /* * Note: IOException is declared for RMI but will not be thrown * since the transaction service is in fact local. */ throw new RuntimeException(e); } } /** * Validate the write set for a transaction. This operation is not required. * Validation will be performed during commit processing for a transaction * regardless. * * @param txId * The transaction identifier. * * @return <code>true</code> iff the write set of the transaction could be * validated. * * @throws TransactionNotFoundException * if no such transaction exists. */ final public boolean prepare(final long txId) { final Tx localState = getLocalTransactionManager().getTx(txId); if (localState == null) throw new TransactionNotFoundException(txId); if (localState.isReadOnly()) { // Trivally validated. return true; } try { final AbstractTask<Boolean> task = new ValidateWriteSetTask( concurrencyManager, getLocalTransactionManager(), localState); /* * Submit the task and wait for the result. * * Note: This task MUST go through the ConcurrencyManager to obtain its * locks. */ final boolean ok = concurrencyManager.submit(task).get(); return ok; } catch (Exception ex) { throw new RuntimeException(ex); } } // /** // * @deprecated This method in particular should be hidden from the // * {@link Journal} as it exposes the {@link ITx} which really // * deals with the client-side state of a transaction and which // * should not be visible to applications - they should just use // * the [long] transaction identifier. // */ // public ITx getTx(long startTime) { // // return localTransactionManager.getTx(startTime); // // } /** * Returns the next timestamp from the {@link ILocalTransactionManager}. 
* <p> * Note: This is a convenience method. The implementation of this method is * delegated to the object returned by {@link #getTransactionService()}. * * @deprecated This is here for historical reasons and is only used by the * test suite. Use {@link #getLocalTransactionManager()} and * {@link ITransactionService#nextTimestamp()}. * * @see ITransactionService#nextTimestamp() */ final public long nextTimestamp() { return localTransactionManager.nextTimestamp(); } /* * IConcurrencyManager */ public ConcurrencyManager getConcurrencyManager() { return concurrencyManager; } /** * Note: The transaction service is shutdown first, then the * {@link #executorService}, then the {@link IConcurrencyManager}, the * {@link ITransactionService} and finally the {@link IResourceLockService}. */ @Override synchronized public void shutdown() { if (!isOpen()) return; /* * Shutdown the transaction service. This will not permit new * transactions to start and will wait until running transactions either * commit or abort. */ localTransactionManager.shutdown(); { final IPlugIn<?, ?> plugIn = pluginGanglia.get(); if (plugIn != null) { // stop if running. plugIn.stopService(false/* immediateShutdown */); } } { final IPlugIn<?, ?> plugIn = pluginQueueStats.get(); if (plugIn != null) { // stop if running. plugIn.stopService(false/* immediateShutdown */); } } { final IPlugIn<?, ?> plugIn = pluginPlatformStats.get(); if (plugIn != null) { // stop if running. plugIn.stopService(false/* immediateShutdown */); } } if (scheduledExecutorService != null) { scheduledExecutorService.shutdown(); } // optional httpd service for the local counters. { final IPlugIn<?, ?> plugIn = pluginHttpd.get(); if (plugIn != null) { // stop if running. plugIn.stopService(false/* immediateShutdown */); } } /* * Shutdown the executor service. This will wait for any tasks being run * on that service by the application to complete. 
*/ try { new ShutdownHelper(executorService, 1000/* logTimeout */, TimeUnit.MILLISECONDS) { @Override protected void logTimeout() { log.warn("Waiting on task(s)" + ": elapsed=" + TimeUnit.NANOSECONDS.toMillis(elapsed()) + "ms, #active=" + ((ThreadPoolExecutor) executorService) .getActiveCount()); } }; } catch (InterruptedException ex) { log.warn("Immediate shutdown: "+ex); // convert to immediate shutdown. shutdownNow(); return; } /* * Shutdown the concurrency manager - this will allow existing * non-transactional operations to complete but prevent additional * operations from starting. */ concurrencyManager.shutdown(); super.shutdown(); } /** * Note: The {@link IConcurrencyManager} is shutdown first, then the * {@link ITransactionService} and finally the {@link IResourceManager}. */ @Override synchronized public void shutdownNow() { if (!isOpen()) return; /* * Note: The ganglia plug in is executed on the main thread pool. We * need to terminate it in order for the thread pool to shutdown. */ { final IPlugIn<?, ?> plugIn = pluginGanglia.get(); if (plugIn != null) { // stop if running. plugIn.stopService(true/* immediateShutdown */); } } { final IPlugIn<?, ?> plugIn = pluginQueueStats.get(); if (plugIn != null) { // stop if running. plugIn.stopService(true/* immediateShutdown */); } } { final IPlugIn<?, ?> plugIn = pluginPlatformStats.get(); if (plugIn != null) { // stop if running. plugIn.stopService(true/* immediateShutdown */); } } if (scheduledExecutorService != null) scheduledExecutorService.shutdownNow(); // optional httpd service for the local counters. { final IPlugIn<?, ?> plugIn = pluginHttpd.get(); if (plugIn != null) { // stop if running. plugIn.stopService(false/* immediateShutdown */); } } // Note: can be null if error in ctor. if (executorService != null) executorService.shutdownNow(); // Note: can be null if error in ctor. if (concurrencyManager != null) concurrencyManager.shutdownNow(); // Note: can be null if error in ctor. 
if (localTransactionManager != null) localTransactionManager.shutdownNow(); super.shutdownNow(); } // public void deleteResources() { // // super.deleteResources(); // // // Note: can be null if error in ctor. // if (tempStoreFactory != null) // tempStoreFactory.closeAll(); // // } /** * {@inheritDoc} * <p> * Overridden to close the {@link TemporaryStoreFactory}. */ @Override protected void _close() { super._close(); // Note: can be null if error in ctor. if (tempStoreFactory != null) tempStoreFactory.closeAll(); } @Override public <T> FutureTask<T> submit(AbstractTask<T> task) { return concurrencyManager.submit(task); } @Override public <T> List<Future<T>> invokeAll( final Collection<? extends AbstractTask<T>> tasks, final long timeout, final TimeUnit unit) throws InterruptedException { return concurrencyManager.invokeAll(tasks, timeout, unit); } @Override public <T> List<Future<T>> invokeAll( Collection<? extends AbstractTask<T>> tasks) throws InterruptedException { return concurrencyManager.invokeAll(tasks); } @Override public IResourceManager getResourceManager() { return concurrencyManager.getResourceManager(); } @Override public ILocalTransactionManager getTransactionManager() { // return concurrencyManager.getTransactionManager(); return localTransactionManager; } public ITransactionService getTransactionService() { // return getTransactionManager().getTransactionService(); return localTransactionManager.getTransactionService(); } @Override public WriteExecutorService getWriteService() { return concurrencyManager.getWriteService(); } /* * IResourceManager */ /** * Note: This implementation always returns <code>false</code>. As a * consequence the journal capacity will simply be extended by * {@link #write(ByteBuffer)} until the available disk space is exhausted. * * @return This implementation returns <code>false</code> since overflow * is NOT supported. 
     */
    @Override
    public boolean shouldOverflow() {

        // Overflow processing is never triggered for a standalone Journal.
        return false;

    }

    /**
     * Note: This implementation always returns <code>false</code>.
     */
    @Override
    public boolean isOverflowEnabled() {

        return false;

    }

    /**
     * Overflow is not supported by the standalone Journal.
     * 
     * @throws UnsupportedOperationException
     *             always.
     */
    @Override
    public Future<Object> overflow() {

        throw new UnsupportedOperationException();

    }

    // /**
    // * This request is always ignored for a {@link Journal} since it does not
    // * have any resources to manage.
    // */
    // public void setReleaseTime(final long releaseTime) {
    //
    // if (releaseTime < 0L) {
    //
    // // Not a timestamp.
    // throw new IllegalArgumentException();
    //
    // }
    //
    // // ignored.
    //
    // }

    /**
     * @throws UnsupportedOperationException
     *             since {@link #overflow()} is not supported.
     */
    @Override
    public File getIndexSegmentFile(IndexMetadata indexMetadata) {

        throw new UnsupportedOperationException();

    }

    /**
     * @throws UnsupportedOperationException
     *             always.
     */
    @Override
    public IBigdataFederation<?> getFederation() {

        throw new UnsupportedOperationException();

    }

    /**
     * @throws UnsupportedOperationException
     *             always.
     */
    @Override
    public DataService getDataService() {

        throw new UnsupportedOperationException();

    }

    /**
     * @throws UnsupportedOperationException
     *             always.
     */
    @Override
    public UUID getDataServiceUUID() {

        throw new UnsupportedOperationException();

    }

    /**
     * Always returns <code>null</code> since index partition moves are not
     * supported.
     */
    @Override
    public StaleLocatorReason getIndexPartitionGone(String name) {

        return null;

    }

    /*
     * global row store.
     */

    @Override
    public SparseRowStore getGlobalRowStore() {

        return getGlobalRowStoreHelper().getGlobalRowStore();

    }

    // /**
    // * Return a view of the global row store as of the specified timestamp. This
    // * is mainly used to provide access to historical views.
    // *
    // * @param timestamp
    // * The specified timestamp.
    // *
    // * @return The global row store view -or- <code>null</code> if no view
    // * exists as of that timestamp.
// */ @Override public SparseRowStore getGlobalRowStore(final long timestamp) { return getGlobalRowStoreHelper().get(timestamp); } /** * Return the {@link GlobalRowStoreHelper}. * <p> * Note: An atomic reference provides us with a "lock" object which doubles * as a reference. We are not relying on its CAS properties. */ private final GlobalRowStoreHelper getGlobalRowStoreHelper() { GlobalRowStoreHelper t = globalRowStoreHelper.get(); if (t == null) { synchronized (globalRowStoreHelper) { /* * Note: Synchronized to avoid race conditions when updating * (this allows us to always return our reference if we create a * new helper instance). */ t = globalRowStoreHelper.get(); if (t == null) { globalRowStoreHelper .set(t = new GlobalRowStoreHelper(this)); } } } return globalRowStoreHelper.get(); } final private AtomicReference<GlobalRowStoreHelper> globalRowStoreHelper = new AtomicReference<GlobalRowStoreHelper>(); /* * global file system. * * Note: An atomic reference provides us with a "lock" object which doubles * as a reference. We are not relying on its CAS properties. */ @Override public BigdataFileSystem getGlobalFileSystem() { GlobalFileSystemHelper t = globalFileSystemHelper.get(); if (t == null) { synchronized (globalFileSystemHelper) { /* * Note: Synchronized to avoid race conditions when updating * (this allows us to always return our reference if we create a * new helper instance). */ t = globalFileSystemHelper.get(); if (t == null) { globalFileSystemHelper .set(t = new GlobalFileSystemHelper(this)); } } } return globalFileSystemHelper.get().getGlobalFileSystem(); } final private AtomicReference<GlobalFileSystemHelper> globalFileSystemHelper = new AtomicReference<GlobalFileSystemHelper>(); @Override protected void discardCommitters() { super.discardCommitters(); synchronized (globalRowStoreHelper) { /* * Note: Synchronized even though atomic. We are using this as an * mutable lock object without regard to its CAS behavior. 
*/ globalRowStoreHelper.set(null); } synchronized (globalFileSystemHelper) { /* * Note: Synchronized even though atomic. We are using this as an * mutable lock object without regard to its CAS behavior. */ globalFileSystemHelper.set(null); } } @Override public TemporaryStore getTempStore() { return tempStoreFactory.getTempStore(); } private final TemporaryStoreFactory tempStoreFactory; @Override public IResourceLocator<?> getResourceLocator() { assertOpen(); return resourceLocator; } private final IResourceLocator<?> resourceLocator; @Override public IResourceLockService getResourceLockService() { assertOpen(); return resourceLockManager; } private final ResourceLockService resourceLockManager; @Override public ExecutorService getExecutorService() { assertOpen(); return executorService; } private final ThreadPoolExecutor executorService; /** * Used to sample and report on the queue associated with the * {@link #executorService}. May be used to schedule other tasks as well. */ private final ScheduledExecutorService scheduledExecutorService; /* * plugins. */ private final AtomicReference<IPlugIn<Journal, ThreadPoolExecutorBaseStatisticsTask>> pluginQueueStats = new AtomicReference<IPlugIn<Journal,ThreadPoolExecutorBaseStatisticsTask>>(); private final AtomicReference<IPlugIn<Journal, AbstractStatisticsCollector>> pluginPlatformStats = new AtomicReference<IPlugIn<Journal, AbstractStatisticsCollector>>(); private final AtomicReference<IPlugIn<Journal, ?>> pluginHttpd = new AtomicReference<IPlugIn<Journal, ?>>(); /** * An optional plug in for Ganglia. * <p> * Note: The plug in concept was introduced to decouple the ganglia * component. Do not introduce imports into the {@link Journal} class that * would make the ganglia code a required dependency! 
* * @see <a href="https://sourceforge.net/apps/trac/bigdata/ticket/609"> * bigdata-ganglia is required dependency for Journal </a> */ private final AtomicReference<IPlugIn<Journal, ?>> pluginGanglia = new AtomicReference<IPlugIn<Journal, ?>>(); /** * Host wide performance counters (collected from the OS) (optional). * * @see PlatformStatsPlugIn */ public AbstractStatisticsCollector getPlatformStatisticsCollector() { final IPlugIn<Journal, AbstractStatisticsCollector> plugin = pluginPlatformStats .get(); if (plugin == null) return null; final AbstractStatisticsCollector t = plugin.getService(); return t; } public Object getGangliaService() { final IPlugIn<Journal, ?> plugin = pluginGanglia.get(); if (plugin == null) return null; return plugin.getService(); } /** * An executor service used to read on the local disk. * * TODO This is currently used by prefetch. We should generalize this * mechanism, probably moving it to the {@link IResourceManager}, and use it * to do all IO, ideally using the JSR 166 fork/join mechanisms. Without * moving this method to another interface, pre-fetch will not work for * AbstractTask. * <p> * This should be reconciled with the {@link ConcurrencyManager}, which has * distinct {@link ExecutorService}s for readers and writers which control * the per-task concurrency while this controls the disk read concurrency. * <p> * We could use the same pool for readers and writers on the disk. */ public LatchedExecutor getReadExecutor() { // assertOpen(); return readService; } private final LatchedExecutor readService; /* * Warm-up Journal. */ /** * Warmup the indicated namespaces. * * @param namespaces * A list of zero or more namespaces to be warmed up (optional). * When <code>null</code> or empty, all namespaces will be warmed * up. * * @return A future for the task that is warming up the indices associated * with those namespace(s). 
 *         The future evaluates to a map from the
 *         name of the index to the statistics collected for that index
 *         during the warmup procedure.
 *
 * @see <a href="http://trac.bigdata.com/ticket/1050" > pre-heat the journal
 *      on startup </a>
 *
 * @see WarmUpTask
 */
public Future<Map<String, BaseIndexStats>> warmUp(
        final List<String> namespaces) {

    /*
     * The indices will be scanned with one thread per index. This parameter
     * determines the #of such scans that will execute in parallel. Since the
     * thread will block on any IO, you need a modestly large number of
     * threads here to enqueue enough disk reads to drive enough IOPs for an
     * efficient disk scan.
     */
    final int nparallel = 20;

    final FutureTask<Map<String, BaseIndexStats>> ft = new FutureTask<Map<String, BaseIndexStats>>(
            new WarmUpTask(this, namespaces,
                    ITx.READ_COMMITTED/* timestamp */, nparallel, false/* visitLeaves */));

    // Submit on the journal's general purpose executor; the caller can
    // await or cancel the warmup via the returned future.
    getExecutorService().submit(ft);

    return ft;

}

/**
 * This task runs once and starts an (optional)
 * {@link AbstractStatisticsCollector} and an (optional) httpd service.
 * <p>
 *
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan
 *         Thompson</a>
 */
private class StartDeferredTasksTask implements Runnable {

    /**
     * Note: The logger is named for this class, but since it is an inner
     * class the name uses a "$" delimiter (vs a ".") between the outer and
     * the inner class names.
     */
    final private Logger log = Logger.getLogger(StartDeferredTasksTask.class);

    private StartDeferredTasksTask() {
    }

    @Override
    public void run() {

        try {

            startDeferredTasks();

        } catch (Throwable t) {

            // Log and swallow: a failed deferred start must not propagate
            // into the scheduling executor.
            log.error(t, t);

            return;

        }

    }

    /**
     * Starts performance counter collection.
     *
     * Note: the plug-ins are started in a fixed order. In particular the
     * ganglia plug-in (last) is only attempted when the platform statistics
     * collector (second) is actually running.
     */
    protected void startDeferredTasks() throws IOException {

        // start collection on various work queues.
        {

            final IPlugIn<Journal, ThreadPoolExecutorBaseStatisticsTask> tmp = new QueueStatsPlugIn();

            tmp.startService(Journal.this);

            // Save reference iff started.
            pluginQueueStats.set(tmp);

        }

        // start collecting performance counters (if enabled).
        {

            final IPlugIn<Journal, AbstractStatisticsCollector> tmp = new PlatformStatsPlugIn();

            tmp.startService(Journal.this);

            pluginPlatformStats.set(tmp);

        }

        // start the local httpd service reporting on this service.
        {

            final IPlugIn<Journal, CounterSetHTTPD> tmp = new HttpPlugin();

            tmp.startService(Journal.this);

            pluginHttpd.set(tmp);

        }

        /**
         * Start embedded ganglia peer. It will develop a snapshot of the
         * metrics in memory for all nodes reporting in the ganglia network
         * and will self-report metrics from the performance counter
         * hierarchy to the ganglia network.
         *
         * Note: Do NOT invoke this plug in unless it will start and run to
         * avoid a CLASSPATH dependency on bigdata-ganglia when it is not
         * used. The plugin requires platform statistics collection to run,
         * so if you do not want to have a CLASSPATH dependency on ganglia,
         * you need to disable the PlatformStatsPlugIn.
         *
         * @see <a
         *      href="https://sourceforge.net/apps/trac/bigdata/ticket/609">
         *      bigdata-ganglia is required dependency for Journal </a>
         */
        if (getPlatformStatisticsCollector() != null) {

            final IPlugIn<Journal, ?> tmp = new GangliaPlugIn();

            tmp.startService(Journal.this);

            if (tmp.isRunning()) {

                // Save reference iff started.
                pluginGanglia.set(tmp);

            }

        }

    }

} // class StartDeferredTasks

@Override
public ScheduledFuture<?> addScheduledTask(final Runnable task,
        final long initialDelay, final long delay, final TimeUnit unit) {

    if (task == null)
        throw new IllegalArgumentException();

    if (log.isInfoEnabled())
        log.info("Scheduling task: task=" + task.getClass()
                + ", initialDelay=" + initialDelay + ", delay=" + delay
                + ", unit=" + unit);

    // Fixed-delay (not fixed-rate) scheduling: the delay is measured from
    // the completion of one run to the start of the next.
    return scheduledExecutorService.scheduleWithFixedDelay(task,
            initialDelay, delay, unit);

}

/**
 * {@inheritDoc}
 *
 * @see Options#COLLECT_PLATFORM_STATISTICS
 */
@Override
final public boolean getCollectPlatformStatistics() {
    return Boolean.valueOf(properties.getProperty(
            Options.COLLECT_PLATFORM_STATISTICS,
            Options.DEFAULT_COLLECT_PLATFORM_STATISTICS));
}

/**
 * {@inheritDoc}
 *
 * @see Options#COLLECT_QUEUE_STATISTICS
 */
@Override
final public boolean getCollectQueueStatistics() {
    return Boolean.valueOf(properties.getProperty(
            Options.COLLECT_QUEUE_STATISTICS,
            Options.DEFAULT_COLLECT_QUEUE_STATISTICS));
}

/**
 * {@inheritDoc}
 *
 * @see Options#HTTPD_PORT
 */
@Override
final public int getHttpdPort() {
    return Integer.valueOf(properties.getProperty(Options.HTTPD_PORT,
            Options.DEFAULT_HTTPD_PORT));
}

/*
 * Per index counters.
 */

/**
 * Canonical per-index {@link BTreeCounters}. These counters are set on each
 * {@link AbstractBTree} that is materialized by
 * {@link #getIndexOnStore(String, long, IRawStore)}. The same
 * {@link BTreeCounters} object is used for the unisolated, read-committed,
 * read-historical and isolated views of the index and for each source in
 * the view regardless of whether the source is a mutable {@link BTree} on
 * the live journal or a read-only {@link BTree} on a historical journal.
* * @see #getIndexCounters(String) * @see #dropIndex(String) */ final private ConcurrentHashMap<String/* name */, BTreeCounters> indexCounters = new ConcurrentHashMap<String, BTreeCounters>(); @Override public BTreeCounters getIndexCounters(final String name) { if (name == null) throw new IllegalArgumentException(); // first test for existence. BTreeCounters t = indexCounters.get(name); if (t == null) { // not found. create a new instance. t = new BTreeCounters(); // put iff absent. final BTreeCounters oldval = indexCounters.putIfAbsent(name, t); if (oldval != null) { // someone else got there first so use their instance. t = oldval; } else { if (log.isInfoEnabled()) log.info("New counters: indexPartitionName=" + name); } } assert t != null; return t; } /** * A Journal level semaphore used to restrict applications to a single * unisolated connection. The "unisolated" connection is an application * level construct which supports highly scalable ACID operations but only a * single such "connection" can exist at a time for a Journal. This * constraint arises from the need for the application to coordinate * operations on the low level indices and commit/abort processing while it * holds the permit. * <p> * An AccessSemaphore that implements an Exclusive/SharedAccess idiom is used * to ensure exclusive Unisolated access and shared ReadWrite. */ private final AccessSemaphore accessSemaphore = new AccessSemaphore(Integer.MAX_VALUE/* max shared */); /** * Acquire an Access object for the UNISOLATED connection. * * @throws InterruptedException */ public Access acquireUnisolatedConnectionAccess() throws InterruptedException { return accessSemaphore.acquireExclusive(); } /** * Acquire an Access for a read/write isolated transaction. 
 *
 * @see BLZG-2041
 */
public Access acquireReadWriteConnectionAccess() throws InterruptedException {

    /*
     * Multiple ReadWrites are permitted shared access
     */
    // Shared (not exclusive) acquisition: read/write transactions may
    // proceed concurrently; only the UNISOLATED connection is exclusive.
    return accessSemaphore.acquireShared();

}

@Override
public boolean isHAJournal() {

    // Always false for this class; HA-aware journal variants presumably
    // override this - confirm against subclasses.
    return false;

}

}