/* Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved. Contact: SYSTAP, LLC DBA Blazegraph 2501 Calvert ST NW #106 Washington, DC 20008 licenses@blazegraph.com This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; version 2 of the License. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* * Created on Dec 18, 2008 */ package com.bigdata.service; import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.LinkedList; import java.util.List; import java.util.Properties; import java.util.UUID; import java.util.concurrent.BrokenBarrierException; import java.util.concurrent.Callable; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.CyclicBarrier; import java.util.concurrent.Future; import java.util.concurrent.ScheduledFuture; import java.util.concurrent.TimeUnit; import java.util.concurrent.locks.Condition; import java.util.zip.Adler32; import com.bigdata.btree.BTree; import com.bigdata.btree.IRangeQuery; import com.bigdata.btree.ITuple; import com.bigdata.btree.ITupleIterator; import com.bigdata.concurrent.LockManager; import com.bigdata.concurrent.LockManagerTask; import com.bigdata.config.LongValidator; import com.bigdata.counters.CounterSet; import 
com.bigdata.counters.Instrument;
import com.bigdata.journal.IDistributedTransactionService;
import com.bigdata.journal.ITransactionService;
import com.bigdata.journal.ITx;
import com.bigdata.journal.Name2Addr;
import com.bigdata.journal.RunState;
import com.bigdata.util.concurrent.ExecutionExceptions;

/**
 * Implementation for an {@link IBigdataFederation} supporting both single-phase
 * commits (for transactions that execute on a single {@link IDataService}) and
 * distributed commits.
 * 
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
 * @version $Id$
 */
public abstract class DistributedTransactionService extends
        AbstractTransactionService implements IDistributedTransactionService {

    /**
     * Options understood by this service.
     * 
     * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
     */
    public interface Options extends AbstractTransactionService.Options {

        /**
         * The directory in which the persistent state of this service will be
         * stored.
         */
        String DATA_DIR = DistributedTransactionService.class.getName()
                + ".dataDir";

        /**
         * The interval in milliseconds between writing a snapshot of the index
         * of accessible commit points into the {@link #DATA_DIR} ({@value #DEFAULT_SHAPSHOT_INTERVAL}).
         * <p>
         * Two snapshots are retained of the commit time index so that those
         * historical commit times required for reading on committed states of
         * the database GT the <i>releaseTime</i> may be on hand after a
         * service restart. Two snapshots are maintained, with the older
         * snapshot being overwritten each time. A snapshot is written every N
         * milliseconds, where N is configured using this property, and also
         * when the service is shutdown.
         * <p>
         * This MAY be ZERO (0L) to disable snapshots - a feature that is used
         * by the {@link EmbeddedFederation} when run in a diskless mode.
         */
        String SHAPSHOT_INTERVAL = DistributedTransactionService.class
                .getName() + ".snapshotInterval";

        /** 5 minutes (in milliseconds).
         */
        String DEFAULT_SHAPSHOT_INTERVAL = "" + (5 * 60 * 1000);

    }

    /**
     * A map of the distributed transactions that are currently committing.
     * 
     * @todo config for initial capacity and concurrency?
     */
    private final ConcurrentHashMap<Long/* tx */, DistributedTxCommitTask/* state */> commitList = new ConcurrentHashMap<Long, DistributedTxCommitTask>();

    /**
     * The {@link LockManager} used to impose a partial ordering on the prepare
     * phase of distributed transaction commits using index partition names as
     * the named resources for which the tasks must contend.
     */
    private final LockManager<String> indexLockManager = new LockManager<String>(
            0/* maxConcurrencyIsIgnored */, true/* predeclareLocks */);

    /**
     * The {@link LockManager} used to impose a partial ordering on the commit
     * phase of distributed transaction commits using {@link IDataService}
     * {@link UUID}s as the named resources for which the tasks must contend.
     */
    private final LockManager<UUID> dataServiceLockManager = new LockManager<UUID>(
            0/* maxConcurrencyIsIgnored */, true/* predeclareLocks */);

    /**
     * A {@link BTree} containing a log of the historical commit points.
     * <p>
     * The main things that it gives us are (a) the half-open ranges within
     * which we can allocate read-historical transactions; and (b) the last
     * commit time on record. It seems that creating an image of the log every N
     * seconds should be sufficient.
     * <p>
     * Note: Read and write operations on this index MUST be synchronized on the
     * index object.
     */
    protected final CommitTimeIndex commitTimeIndex;

    /**
     * True iff the service does not write any state on the disk.
     */
    private final boolean isTransient;

    /**
     * The data directory -or- <code>null</code> iff the service is transient.
     */
    protected final File dataDir;

    /**
     * The interval in milliseconds between logging an image of the
     * {@link #commitTimeIndex}.
     * 
     * @see Options#SHAPSHOT_INTERVAL
     */
    private final long snapshotInterval;

    /**
     * The last (known) commit time.
*/ private volatile long lastCommitTime = 0L; /** * @param properties */ public DistributedTransactionService(final Properties properties) { super(properties); if (properties.getProperty(Options.DATA_DIR) == null) { throw new RuntimeException("Required property: " + Options.DATA_DIR); } snapshotInterval = LongValidator.GTE_ZERO.parse( Options.SHAPSHOT_INTERVAL, properties.getProperty( Options.SHAPSHOT_INTERVAL, Options.DEFAULT_SHAPSHOT_INTERVAL)); if (log.isInfoEnabled()) log.info(Options.SHAPSHOT_INTERVAL + "=" + snapshotInterval); isTransient = snapshotInterval == 0; if (isTransient) { dataDir = null; } else { dataDir = new File(properties.getProperty(Options.DATA_DIR)); if (log.isInfoEnabled()) log.info(Options.DATA_DIR + "=" + dataDir); } // Create transient BTree for the commit time log. commitTimeIndex = CommitTimeIndex.createTransient(); setup(); if (log.isInfoEnabled()) log.info("lastCommitTime=" + lastCommitTime + ", #commitTimes=" + commitTimeIndex.getEntryCount()); } /** * Either creates the data directory or reads the {@link #commitTimeIndex} * from files in an existing data directory. */ private void setup() { if(isTransient) { // nothing committed yet. lastCommitTime = 0L; return; } if (!dataDir.exists()) { /* * New service if its data directory does not exist. */ if (!dataDir.mkdirs() && !dataDir.mkdirs()) { throw new RuntimeException("Could not create: " + dataDir); } // nothing committed yet. lastCommitTime = 0L; return; } { // the files on which the images should have been written. final File file0 = new File(dataDir, BASENAME + "0" + EXT); final File file1 = new File(dataDir, BASENAME + "1" + EXT); if (!file0.exists() && !file1.exists()) { log.warn("No commit time logs - assuming new service: dataDir=" + dataDir); // nothing committed yet. lastCommitTime = 0L; return; } // timestamps on those files (zero if the file does not exist) final long time0 = file0.lastModified(); final long time1 = file1.lastModified(); // true iff file0 is more recent. 
final boolean isFile0 = (time0 != 0L && time1 != 0L) // ? (time0 > time1 ? true: false)// Note: both files exist. : (time0 != 0L ? true: false)// Note: only one file exists ; final File file = isFile0 ? file0 : file1; // System.err.println("file0: "+file0.lastModified()); // System.err.println("file1: "+file1.lastModified()); // System.err.println("isFile0="+isFile0); /* * Note: On restart the value of this counter is set to either * ONE(1) or TWO(1) depending on which snapshot file is more * current. * * It is ONE(1) if we read file0 since the counter would be ONE(1) * after we write file0 for the first time. * * It is TWO(2) if we read file1 since the counter would be TWO(2) * after we write file1 for the first time. */ snapshotCount = isFile0 ? 1 : 2; try { // read most recent image. final long entryCount = SnapshotHelper.read(commitTimeIndex, file); log.warn("Read snapshot: entryCount=" + entryCount + ", file=" + file); } catch (IOException ex) { throw new RuntimeException("Could not read file: " + file, ex); } } if (commitTimeIndex.getEntryCount() == 0) { // nothing in the commit time log. lastCommitTime = 0; } else { // the last commit time in the log. @todo write unit test to // verify on restart. lastCommitTime = commitTimeIndex.decodeKey(commitTimeIndex .keyAt(commitTimeIndex.getEntryCount() - 1)); } } /** * Basename for the files written in the {@link #dataDir} containing images * of the {@link #commitTimeIndex}. */ static protected final String BASENAME = "commitTime"; /** * Extension for the files written in the {@link #dataDir} containing * snapshots of the {@link #commitTimeIndex}. */ static protected final String EXT = ".snapshot"; /** * #of times we have written a snapshot of the {@link #commitTimeIndex}. */ private long snapshotCount = 0L; /** * Runs the {@link SnapshotTask} once. */ public void snapshot() { new SnapshotTask().run(); } /** * A task that writes a snapshot of the commit time index onto a pair of * alternating files. 
This is in the spirit of the Challis algorithm, but * the approach is less rigorous. * * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a> */ private class SnapshotTask implements Runnable { /** * Note: Anything thrown out of this method will cause the task to no * longer be scheduled! */ public void run() { if(isTransient) { // snapshot not supported for transient service. throw new RuntimeException("Service is transient"); } lock.lock(); try { final long begin = System.currentTimeMillis(); // either 0 or 1. final int i = (int) snapshotCount % 2; final File file = new File(dataDir, BASENAME + i + EXT); if (!dataDir.exists()) { if (!dataDir.mkdirs() && !dataDir.mkdirs()) { throw new RuntimeException("Could not create: " + dataDir); } } final long entryCount; synchronized (commitTimeIndex) { entryCount = SnapshotHelper.write(commitTimeIndex, file); } // increment counter iff successful. snapshotCount++; final long elapsed = System.currentTimeMillis() - begin; log.warn("snapshot: snapshotCount=" + snapshotCount + ", entryCount=" + entryCount + ", file=" + file + ", elapsed=" + elapsed); } catch (Throwable t) { log.error(t.getMessage(), t); return; } finally { lock.unlock(); } } }; /** * A helper class for reading and writing snapshots of the commit time * index. The image contains the commit timestamps in order. * <p> * Note: The caller must prevent concurrent changes to the index. * * @todo write counters into the files since the system clock could be * messed with on before a restart but the counters will always be * valid. we would then either read both and choose one, or have a * method to report the header with the earlier counter. * * @todo Checksum the commit time log file? this is easily done either using * a {@link ByteBuffer} or using {@link Adler32}. 
* * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a> */ public static class SnapshotHelper { static public long read(CommitTimeIndex ndx, File file) throws IOException { final FileInputStream is = new FileInputStream(file); try { final BufferedInputStream bis = new BufferedInputStream(is); final DataInputStream dis = new DataInputStream(bis); return SnapshotHelper.read(ndx, dis); } finally { is.close(); } } static public long read(CommitTimeIndex ndx, DataInputStream is) throws IOException { final long n = is.readLong(); for (int i = 0; i < n; i++) { ndx.add(is.readLong()); } return n; } static public long write(final CommitTimeIndex ndx, final File file) throws IOException { final FileOutputStream os = new FileOutputStream(file); try { final BufferedOutputStream bos = new BufferedOutputStream(os); final DataOutputStream dos = new DataOutputStream(bos); // write the image on the file. final long entryCount = SnapshotHelper.write(ndx, dos); dos.flush(); bos.flush(); return entryCount; } finally { os.close(); } } static public long write(final CommitTimeIndex ndx, final DataOutputStream os) throws IOException { final long entryCount = ndx.getEntryCount(); os.writeLong(entryCount); final ITupleIterator itr = ndx.rangeIterator(); int n = 0; while (itr.hasNext()) { final ITuple tuple = itr.next(); final long commitTime = ndx.decodeKey(tuple.getKey()); os.writeLong(commitTime); n++; } if (n != entryCount) { /* * Note: probable error is the caller not preventing concurrent * modification. */ throw new AssertionError(); } return entryCount; } } public DistributedTransactionService start() { /* * Note: lock makes operation _mostly_ atomic even though the base class * changes the runState. For example, new transactions can not start * without this lock. */ lock.lock(); try { super.start(); addScheduledTasks(); return this; } finally { lock.unlock(); } } /** * Adds the scheduled tasks. 
 */
    protected void addScheduledTasks() {

        // Caller must hold the service lock (start() does).
        if (!lock.isHeldByCurrentThread())
            throw new IllegalMonitorStateException();

        final AbstractFederation fed = (AbstractFederation) getFederation();

        // @todo config options (verify units).
        notifyFuture = fed.addScheduledTask(new NotifyReleaseTimeTask(),
                60/* initialDelay */, 60/* delay */, TimeUnit.SECONDS);

        if (snapshotInterval != 0L) {

            // start the snapshot task.
            writeFuture = fed.addScheduledTask(
                    new SnapshotTask(),
                    snapshotInterval/* initialDelay */,
                    snapshotInterval/* delay */,
                    TimeUnit.MILLISECONDS);

        }

    }

    // Future for the periodic release time notification task (null until start()).
    private ScheduledFuture notifyFuture = null;

    // Future for the periodic snapshot task (null until start(); never set for
    // a transient service).
    private ScheduledFuture writeFuture = null;

    public void shutdown() {

        lock.lock();

        try {

            switch (getRunState()) {
            case Shutdown:
            case ShutdownNow:
            case Halted:
                // Already shutting down (or halted) - NOP.
                return;
            }

            /*
             * First make sure that all tx are terminated - this is important
             * otherwise we will write the commit time index image before we
             * have the last commit times on hand.
             */
            super.shutdown();

            /*
             * No need to interrupt this task. It will complete soon enough.
             * However, we do want to cancel it so it will stop running.
             */
            if (notifyFuture != null)
                notifyFuture.cancel(false/* mayInterruptIfRunning */);

            /*
             * Cancel this task, but DO NOT interrupt it to avoid a partial
             * write if there is a write in progress. If there is a write in
             * progress, then we will wind up writing it again immediately since
             * we do that below. This is Ok. We will just have a current image
             * and a nearly current image.
             */
            if (writeFuture != null)
                writeFuture.cancel(false/* mayInterruptIfRunning */);

            if (snapshotInterval != 0L) {

                // write a final image during shutdown.
                new SnapshotTask().run();

            }

        } finally {

            lock.unlock();

        }

    }

    public void shutdownNow() {

        lock.lock();

        try {

            switch (getRunState()) {
            case ShutdownNow:
            case Halted:
                // Already shutting down immediately (or halted) - NOP.
                return;
            }

            /*
             * First make sure that all tx are terminated - this is important
             * otherwise we will write the commit time index image before we
             * have the last commit times on hand.
             */
            super.shutdownNow();

            /*
             * Cancel and interrupt if running.
             */
            if (notifyFuture != null)
                notifyFuture.cancel(true/* mayInterruptIfRunning */);

            /*
             * Cancel this task and interrupt if running. Interrupting this will
             * leave a partial snapshot on the disk, but we do not advance the
             * counter unless the snapshot is successful so we will overwrite
             * that partial snapshot below when we write a final snapshot.
             */
            if (writeFuture != null)
                writeFuture.cancel(true/* mayInterruptIfRunning */);

            if (snapshotInterval != 0L) {

                // write a final snapshot during shutdown.
                snapshot();

            }

        } finally {

            lock.unlock();

        }

    }

    public void destroy() {

        lock.lock();

        try {

            super.destroy();

            if (!isTransient) {

                // delete the commit time index log files.
                new File(dataDir, BASENAME + "0" + EXT).delete();
                new File(dataDir, BASENAME + "1" + EXT).delete();

                // delete the data directory (works iff it is empty).
                dataDir.delete();

            }

        } finally {

            lock.unlock();

        }

    }

    /**
     * Extended to truncate the head of the {@link #commitTimeIndex} such that
     * only the commit times required for reading on timestamps GTE to the new
     * releaseTime are retained.
     */
    protected void setReleaseTime(long releaseTime) {

        super.setReleaseTime(releaseTime);

        /*
         * Truncate the head of the commit time index since we will no longer
         * grant transactions whose start time is LTE the new releaseTime.
         */

        // Note: Use the current value.
        releaseTime = getReleaseTime();

        if (releaseTime > 0) {

            synchronized (commitTimeIndex) {

                /*
                 * The exclusive upper bound is the timestamp of the earliest
                 * commit point on which we can read with this [releaseTime].
                 */
                final long toKey = commitTimeIndex.find(releaseTime + 1);

                final ITupleIterator itr = commitTimeIndex.rangeIterator(0L,
                        toKey, 0/* capacity */, IRangeQuery.KEYS
                                | IRangeQuery.CURSOR, null/* filter */);

                while (itr.hasNext()) {

                    itr.next();

                    // remove the tuple from the index.
                    itr.remove();

                }

            }

        }

    }

    /**
     * Return the proxies for the services participating in a distributed
     * transaction commit or abort.
 * <p>
 * Note: This method is here so that it may be readily overridden for unit
 * tests.
 * 
 * @param uuids
 *            The {@link UUID}s of the participating services.
 * 
 * @return The corresponding service proxies.
 */
    protected ITxCommitProtocol[] getDataServices(UUID[] uuids) {

        return getFederation().getDataServices(uuids);

    }

    /**
     * Task runs {@link ITxCommitProtocol#abort(long)}.
     * 
     * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
     */
    private static class AbortTask implements Callable<Void> {

        // The data service on which the tx will be aborted.
        private final ITxCommitProtocol service;

        // The transaction state (supplies the tx identifier).
        private final TxState state;

        public AbortTask(final ITxCommitProtocol service, final TxState state) {

            if (service == null)
                throw new IllegalArgumentException();

            if (state == null)
                throw new IllegalArgumentException();

            this.service = service;

            this.state = state;

        }

        public Void call() throws Exception {

            service.abort(state.tx);

            return null;

        }

    }

    @Override
    protected void abortImpl(final TxState state) throws Exception {

        if (!state.lock.isHeldByCurrentThread())
            throw new IllegalMonitorStateException();

        if (!state.isActive())
            throw new IllegalStateException();

        if (state.isReadOnly()) {

            /*
             * Note: There is no local state for read-only tx so we do not need
             * to message the data services.
             */

            state.setRunState(RunState.Aborted);

            return;

        }

        final UUID[] uuids = state.getDataServiceUUIDs();

        final ITxCommitProtocol[] services = getDataServices(uuids);

        final List<Callable<Void>> tasks = new ArrayList<Callable<Void>>(
                uuids.length);

        for (ITxCommitProtocol dataService : services) {

            tasks.add(new AbortTask(dataService, state));

        }

        final List<Future<Void>> futures = getFederation().getExecutorService()
                .invokeAll(tasks);

        List<Throwable> causes = null;

        for (Future<Void> f : futures) {

            try {

                // verify no errors.
                f.get();

            } catch (Throwable t) {

                /*
                 * Collect all causes and always log an error if any data
                 * service abort fails.
                 * 
                 * Note: If an exception is thrown here the transaction will be
                 * aborted regardless. However, the data service which threw
                 * the exception may still have local state on hand for the tx.
                 */

                log.error(t, t);

                if (causes == null) {

                    causes = new LinkedList<Throwable>();

                }

                causes.add(t);

            }

        }

        // The tx is marked aborted locally even if some data service failed.
        state.setRunState(RunState.Aborted);

        if (causes != null) {

            throw new ExecutionExceptions(state.toString(), causes);

        }

    }

    /**
     * There are two distinct commit protocols depending on whether the
     * transaction write set is distributed across more than one
     * {@link IDataService}. When write set of the transaction lies entirely on
     * a single {@link IDataService}, an optimized commit protocol is used.
     * When the write set of the transaction is distributed, a 3-phase commit is
     * used with most of the work occurring during the "prepare" phase and a
     * very rapid "commit" phase. If a distributed commit fails, even during the
     * "commit", then the transaction will be rolled back on all participating
     * {@link IDataService}s.
     * 
     * <h3>Single phase commits</h3>
     * 
     * A simple commit protocol is used when the write set of the transaction
     * resides entirely on a single {@link IDataService}. Such commits DO NOT
     * contend for named resource locks (either on the index names or on the
     * {@link IDataService} {@link UUID}s). Since such transactions DO NOT have
     * dependencies outside of the specific {@link IDataService}, a necessary
     * and sufficient partial order will be imposed on the executing tasks
     * locally by the {@link IDataService} on which they are executing based
     * solely on the named resources which they declare. Without dependencies on
     * distributed resources, this can not deadlock.
     * 
     * <h3>Distributed commits</h3>
     * 
     * Transaction commits for a distributed database MUST be prepared in a
     * partial order so that they do not deadlock when acquiring the necessary
     * locks on the named indices on the local data services. That partial order
     * is imposed using the {@link #indexLockManager}.
 The named index locks
     * are pre-declared at the start of the distributed commit protocol and are
     * held through both the prepare and commit phases until the end of the
     * commit protocol. The distributed commit must obtain a lock on all of the
     * necessary named index resources before proceeding. If there is an
     * existing commit using some of those resources, then any concurrent commit
     * requiring any of those resources will block. The {@link LockManager} is
     * configured to require pre-declaration of locks. Deadlocks are NOT
     * possible when the locks are pre-declared.
     * <p>
     * A secondary partial ordering is established based on the
     * {@link IDataService} {@link UUID}s during the commit phase. This partial
     * order is necessary to avoid deadlocks for concurrently executing commit
     * phases of distributed transactions that DO NOT share named index locks.
     * Without a partial order over the participating {@link IDataService}s,
     * deadlocks could arise because each transaction will grab an exclusive
     * lock on the write service for each participating {@link IDataService}.
     * By ordering those lock requests, we again ensure that deadlocks can not
     * occur.
     * <p>
     * Note: The prepare phase for distributed commits allows the maximum
     * possible concurrency. This is especially important as validation and
     * merging down onto the unisolated indices can have significant length for
     * large transactions.
     * <p>
     * The commit phase should be very fast, with syncing the disk providing the
     * primary source of latency. All participating indices on the participating
     * data services have already been checkpointed. Once the commitTime is
     * assigned by the {@link DistributedTransactionService}, the group commit
     * need only update the root block on the live journal and sync to disk.
     * 
     * @todo Place timeout on the commit phase where the tx will abort unless
     *       all participants join at the "committed" barrier within ~ 250ms.
     *       That should be a generous timeout, but track aborts for this reason
     *       specifically since they may indicate interesting problems (heavy
     *       swapping, network issues, etc).
     * 
     * @todo make sure that we checkpoint the commit record index and
     *       {@link Name2Addr} before requesting the commitTime to remove even
     *       more latency.
     */
    @Override
    protected long commitImpl(final TxState state) throws Exception {

        if (state.isReadOnly() || state.getDataServiceCount() == 0) {

            /*
             * Note: We do not maintain any transaction state on the client for
             * read-only transactions.
             * 
             * Note: If the transaction was never started on any data service
             * then we do not need to notify any data service. In effect, the
             * tx was started but never used.
             */

            state.setRunState(RunState.Committed);

            return 0L;

        }

        if (!state.isDistributedTx()) {

            /*
             * The write set of the transaction is local to a single data
             * service. In this case we can do a much simpler commit protocol.
             */

            return singlePhaseCommit(state);

        }

        /*
         * The LockManagerTask will handle lock acquisition for the named
         * resources and then invoke our task to perform the commit.
         */
        final LockManagerTask<String, Long> delegate = new LockManagerTask<String, Long>(
                indexLockManager, state.getResources(),
                new DistributedTxCommitTask(state));

        /*
         * This queues the request until it holds the necessary locks (on the
         * named indices used by the transaction). It then prepares the
         * transaction and (if successful) requests the necessary locks for the
         * commit phase (on the data service UUIDs) and then commits the tx.
         */
        return delegate.call();

    }

    /**
     * Prepare and commit a read-write transaction that has written on a single
     * data service.
 */
    protected long singlePhaseCommit(final TxState state) throws Exception {

        // Caller (commitImpl) must hold the lock guarding the tx state.
        if (!state.lock.isHeldByCurrentThread())
            throw new IllegalMonitorStateException();

        final UUID[] uuids = state.getDataServiceUUIDs();

        // By definition the write set lies on exactly one data service.
        if (uuids.length != 1)
            throw new AssertionError();

        final UUID serviceUUID = uuids[0];

        final IDataService dataService = getFederation().getDataService(
                serviceUUID);

        try {

            final long commitTime = dataService.singlePhaseCommit(state.tx);

            state.setRunState(RunState.Committed);

            return commitTime;

        } catch (Throwable t) {

            // A failed single-phase commit aborts the tx.
            state.setRunState(RunState.Aborted);

            throw new RuntimeException(t);

        }

    }

    /**
     * <p>
     * Task runs the distributed commit protocol transaction.
     * </p>
     * Pre-conditions:
     * <p>
     * <ul>
     * <li>The transaction has a distributed write set (this does too much work
     * for a transaction whose write set is local to a single data service).</li>
     * <li>The caller holds the locks for the named index resources declared by
     * the transaction.</li>
     * <li>The transaction {@link TxState#isActive()}.</li>
     * </ul>
     * </p>
     * <p>
     * Post-conditions (success):
     * <ul>
     * <li>The transaction was assigned a <i>revisionTime</i>.</li>
     * <li>All participating data services validated the write set of the
     * transaction using that <i>revisionTime</i> and merge down the write set
     * of the transaction onto the corresponding unisolated indices.</li>
     * <li>The transaction was assigned a <i>commitTime</i>.</li>
     * <li>All participating data services have made the write set of the
     * transaction restart safe and marked the transaction as "committed" in
     * their local data.</li>
     * <li>The transaction {@link TxState#isCommitted()}.</li>
     * </ul>
     * </p>
     * <p>
     * Post-conditions (failure):
     * <ul>
     * <li>The transaction {@link TxState#isAborted()}.</li>
     * <li>Each participating data service has been notified that the
     * transaction was aborted.</li>
     * </ul>
     * </p>
     * 
     * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
     */
    private class DistributedTxCommitTask implements Callable<Long>
{

        /**
         * The transaction whose distributed commit is being executed.
         */
        private final TxState state;

        /**
         * The {@link UUID}s of the participating {@link IDataService}s.
         */
        private final UUID[] uuids;

        /**
         * The proxies for the participating {@link IDataService}s.
         */
        private final ITxCommitProtocol[] services;

        /**
         * The #of participating {@link IDataService}s.
         */
        private final int nservices;

        /**
         * The revision time (assigned once the task begins to execute with all
         * locks held for the named index partitions).
         */
        private long revisionTime;

        /**
         * The commit time (assigned once the prepared barrier breaks and all
         * locks are held for the participating data services).
         * <p>
         * Note: This field is for debugging only.
         */
        private long commitTime;

        /**
         * The thread in which the {@link DistributedTxCommitTask} is executing.
         * This is the {@link Thread} that is used to obtain the locks for the
         * commit phase using the
         * {@link DistributedTransactionService#dataServiceLockManager}.
         */
        final Thread commitThread;

        /**
         * Condition is signaled when the "prepared" barrier breaks.
         * <p>
         * Note: If the barrier does not break because a participant fails then
         * the {@link #commitThread} MUST be interrupted in order for it to
         * awaken.
         */
        final Condition prepared;

        /**
         * Condition is signaled when the necessary locks are held for the
         * participating {@link IDataService}s.
         * 
         * @see DistributedTransactionService#dataServiceLockManager
         */
        final Condition locksHeld;

        /**
         * Condition is signaled when the "committed" barrier breaks.
         */
        final Condition committed;

        /**
         * Barrier used to await the
         * {@link ITransactionService#prepared(long, UUID)} messages during a
         * distributed read-write transaction commit.
         */
        CyclicBarrier preparedBarrier = null;

        /**
         * Barrier used to await the
         * {@link ITransactionService#committed(long, UUID)} messages during a
         * distributed read-write transaction commit.
         */
        CyclicBarrier committedBarrier = null;

        public DistributedTxCommitTask(final TxState state) {

            if (state == null)
                throw new IllegalArgumentException();

            /*
             * Note: If this thread is holding the lock on [TxState] then no
             * other thread can access that object. This issue is resolved by
             * creating [Condition]s on which this thread awaits based on
             * TxState.lock.
             */
            if (!state.lock.isHeldByCurrentThread())
                throw new IllegalMonitorStateException();

            this.state = state;

            // The UUIDs of the participating (meta)dataServices.
            this.uuids = state.getDataServiceUUIDs();

            // The corresponding data services (resolve before acquiring locks).
            this.services = getDataServices(uuids);

            this.nservices = uuids.length;

            // Note: Same thread required for ctor and execution!
            this.commitThread = Thread.currentThread();

            this.prepared = state.lock.newCondition();

            this.locksHeld = state.lock.newCondition();

            this.committed = state.lock.newCondition();

        }

        /**
         * This method will be invoked by the {@link LockManagerTask} once it
         * holds all of the necessary named index resource locks. This is how we
         * impose a partial order for preparing the transaction. Deadlocks can
         * not arise because we predeclare the locks and {@link LockManager} can
         * guarantee no deadlocks in that case by sorting the requested
         * resources and acquiring the locks in the sorted order.
         */
        public Long call() throws Exception {

            assert this.commitThread == Thread.currentThread();

            return distributedCommit(state);

        }

        /**
         * Prepare and commit a read-write transaction that has written on more
         * than one data service.
         * <p>
         * Note: read-write transactions that have written on multiple journals
         * must use a distributed (2-/3-phase) commit protocol. As part of the
         * commit protocol, we obtain an exclusive write lock on each journal on
         * which the transaction has written. This is necessary in order for the
         * transaction as a whole to be assigned a single commit time.
 Latency
         * is critical in this commit protocol since the journals participating
         * in the commit will be unable to perform any unisolated operations
         * until the transaction either commits or aborts.
         * <p>
         * Note: There is an assumption that the revisionTime does not need to
         * be the commitTime. This allows us to get all the heavy work done
         * before we reach the "prepared" barrier, which means that the commit
         * phase should be very fast. The assumption is that validation of
         * different transactions writing on the same unisolated indices is in
         * fact serialized. The transaction services MUST enforce that
         * assumption by serializing distributed commits (at least those which
         * touch the same index partitions (necessary constraint), the same
         * indices (sufficient constraint) or the same {@link IDataService}s
         * (sufficient constraint)). If it did not serialize distributed commits
         * then <strong>deadlocks</strong> could arise where two distributed
         * commits were each seeking the exclusive write lock on resources, one
         * of which was already held by the other commit.
         * 
         * @throws Exception
         *             if anything goes wrong.
         * 
         * @return The commit time for the transaction.
         */
        protected long distributedCommit(final TxState state) throws Exception {

            if (!state.lock.isHeldByCurrentThread())
                throw new IllegalMonitorStateException();

            // choose the revision timestamp.
            this.revisionTime = nextTimestamp();

            // add to map of concurrently committing distributed transactions.
            commitList.put(state.tx, this);

            try {

                /*
                 * Submit a task that will issue the prepare(tx,rev)
                 * messages to each participating data service and await its
                 * future.
                 */
                call2();

                // Note: [commitTime] was set as a side effect of call2().
                return commitTime;

            } finally {

                // Always remove the tx from the commit list, even on error.
                commitList.remove(state.tx);

            }

        }

        /**
         * Sets up the {@link TxState#preparedBarrier} and the
         * {@link TxState#committedBarrier} and then runs the
         * {@link PrepareTask} tasks.
         * <p>
         * Post-conditions: {@link TxState#isComplete()} will be true.
The * transaction will either have been aborted or committed on all * {@link IDataService}s. * * @return The assigned commit time. * * @todo Allow interrupt of the data service committers if any task * fails during prepare() rather than having to wait for all of * those tasks to join at the {@link TxState#preparedBarrier}. * This is only an optimization. We would cancel those tasks using * the {@link TaskRunner}'s {@link Future}. */ public Void call2() throws Exception { Future<?> taskRunnerFuture = null; try { setupPreparedBarrier(); setupCommittedBarrier(); taskRunnerFuture = getFederation().getExecutorService().submit( new TaskRunner()); /* * Signaled when the prepared barrier breaks. Interrupted if * the prepare phase fails. */ prepared.await(); /** * Runs an inner Callable once we have the data service UUID * locks. * <p> * Note: The purpose of this task is to hold onto those locks * until the commit is finished (either success or failure). The * locks are automatically release once the inner Callable * completes regardless of the outcome. * <p> * Note: This task will run in the same thread as the caller. * This means that the task will already hold the * {@link TxState#lock}. */ new LockManagerTask<UUID, Void>(dataServiceLockManager, state .getDataServiceUUIDs(), new Callable<Void>() { public Void call() throws Exception { if (!state.lock.isHeldByCurrentThread()) { /* * Note: The task runs in its caller's thread and * the caller should already be holding the TxState * lock. */ throw new IllegalMonitorStateException(); } /* * Signal so that the task which caused the prepared * barrier to break can resume. It turn, when the * prepared runnable finishes, all tasks awaiting that * barrier will continue to execute and will enter their * "commit" phase. */ locksHeld.signal(); // Signaled when the committed barrier breaks. committed.await(); return null; } }).call(); // Done. return null; } finally { /* * Reset the barriers in case anyone is waiting. 
*/ if (preparedBarrier != null) preparedBarrier.reset(); if (committedBarrier != null) committedBarrier.reset(); /* * Await the future on the task running the PrepareTasks. * * Note: This task SHOULD complete very shortly after a * successful commit. * * Note: If any PrepareTask fails, then all PrepareTasks should * abort shortly thereafter. */ if (taskRunnerFuture != null) taskRunnerFuture.get(); } } /** * Submits the {@link PrepareTask}s in a different thread, awaits their * {@link Future}s and logs any errors. * <p> * Note: The {@link PrepareTask}s are executed outside of the thread * that runs the {@link DistributedTxCommitTask} so that we may use the * thread running the {@link DistributedTxCommitTask} to obtain locks * from the {@link DistributedTransactionService#dataServiceLockManager}. * * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan * Thompson</a> */ private class TaskRunner implements Callable<Void> { public TaskRunner() { } public Void call() throws Exception { // The task MUST NOT run in the commitThread. assert commitThread != Thread.currentThread(); // This thread MUST NOT own the lock. assert !state.lock.isHeldByCurrentThread(); /* * The futures for the tasks used to invoke prepare(tx,rev) on * each dataService. */ final List<Future<Void>> futures; final List<Callable<Void>> tasks = new ArrayList<Callable<Void>>( nservices); for (ITxCommitProtocol dataService : services) { tasks.add(new PrepareTask(dataService)); } try { /* * Await all futures, returning once they are all done. */ futures = getFederation().getExecutorService().invokeAll( tasks); // tx must be complete (either committed or aborted). 
assert state.isComplete() : state.toString(); } catch (Throwable t) { /* * If we can not invoke all tasks then abort */ log.error(t.getLocalizedMessage(), t); state.setRunState(RunState.Aborted); throw new RuntimeException(t); } List<Throwable> causes = null; for (Future f : futures) { try { f.get(); } catch (Throwable t) { if (causes == null) { causes = new LinkedList<Throwable>(); } causes.add(t); log.error(t.getLocalizedMessage(), t); } } if (causes != null) { final int nfailed = causes.size(); state.setRunState(RunState.Aborted); throw new ExecutionExceptions("Committer(s) failed: n=" + nservices + ", nfailed=" + nfailed, causes); } return null; } // call() } // class TaskRunner /** * Sets up the {@link TxState#preparedBarrier}. When the barrier action * runs it will change {@link RunState} to {@link RunState#Prepared} and * assign a <em>commitTime</em> to the transaction. When the barrier * breaks, the assigned <i>commitTime</i> will be reported back to the * {@link IDataService}s waiting in * {@link ITransactionService#prepared(long, UUID)} as the return value * for that method. */ private void setupPreparedBarrier() { preparedBarrier = new CyclicBarrier(nservices, new Runnable() { /** * Method runs when the "prepared" barrier breaks. */ public void run() { state.lock.lock(); try { state.setRunState(RunState.Prepared); /* * Wake up the main thread. It will obtain the necessary * locks for the participating data services and then * signal that we may continue. */ prepared.signal(); try { // wait until the necessary locks are held. locksHeld.await(); } catch (InterruptedException ex) { log.warn("Interrupted", ex); // re-throw the exception. throw new RuntimeException(ex); } // assign a commitTime to this tx. final long commitTime = nextTimestamp(); // Set the commitTime on the outer task. DistributedTxCommitTask.this.commitTime = commitTime; // Set the commitTime on the tx. 
state.setCommitTime(commitTime); } finally { state.lock.unlock(); } } }); } /** * Sets up the {@link TxState#committedBarrier}. When the barrier * action runs it will change the {@link RunState} to * {@link RunState#Committed}. */ protected void setupCommittedBarrier() { committedBarrier = new CyclicBarrier(nservices, new Runnable() { /** * Method runs when the "committed" barrier breaks. At this * point the transaction is fully committed on the participating * data services. */ public void run() { state.lock.lock(); try { // wake up the main thread. committed.signal(); // Set the assigned commitTime on the TxState. state.setCommitTime(commitTime); // Change the tx run state. state.setRunState(RunState.Committed); } finally { state.lock.unlock(); } } }); } /** * Task issues {@link ITxCommitProtocol#prepare(long, long)} to an * {@link IDataService} participating in a distributed commit. * * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan * Thompson</a> */ protected class PrepareTask implements Callable<Void> { final ITxCommitProtocol service; public PrepareTask(final ITxCommitProtocol service) { this.service = service; } public Void call() throws Exception { try { service.prepare(state.tx, revisionTime); } catch (Throwable e) { /* * If an exception is thrown, then make sure that the tx * is in the [Abort] state. */ try { log.error(e.getLocalizedMessage(), e); } catch (Throwable t) { // ignored } state.lock.lock(); try { state.setRunState(RunState.Aborted); } finally { state.lock.unlock(); } throw new RuntimeException(e); } return null; } } } /** * Note: Only those {@link DataService}s on which a read-write transaction * has started will participate in the commit. If there is only a single * such {@link IDataService}, then a single-phase commit will be used. * Otherwise a distributed transaction commit protocol will be used. 
* <p>
     * Note: The commit requests are placed into a partial order by sorting the
     * total set of resources which the transaction declares (via this method)
     * across all operations executed by the transaction and then contending for
     * locks on the named resources using a LockManager. This is
     * handled by the {@link DistributedTransactionService}.
     */
    @Override
    public void declareResources(final long tx, final UUID dataServiceUUID,
            final String[] resource) throws IllegalStateException {

        setupLoggingContext();

        lock.lock();

        try {

            // Only accept declarations while the service is available.
            switch (getRunState()) {
            case Running:
            case Shutdown:
                break;
            default:
                throw new IllegalStateException(ERR_SERVICE_NOT_AVAIL);
            }

            if (dataServiceUUID == null)
                throw new IllegalArgumentException();

            if (resource == null)
                throw new IllegalArgumentException();

            final TxState state = getTxState(tx);

            if (state == null) {

                throw new IllegalStateException(ERR_NO_SUCH);

            }

            state.lock.lock();

            try {

                // Resources may only be declared for active read-write txs.
                if (state.isReadOnly()) {

                    throw new IllegalStateException(ERR_READ_ONLY);

                }

                if (!state.isActive()) {

                    throw new IllegalStateException(ERR_NOT_ACTIVE);

                }

                state.declareResources(dataServiceUUID, resource);

            } finally {

                state.lock.unlock();

            }

        } finally {

            lock.unlock();

            clearLoggingContext();

        }

    }

    /**
     * Waits at "prepared" barrier. When the barrier breaks, examine the
     * {@link TxState}. If the transaction is aborted, then throw an
     * {@link InterruptedException}. Otherwise return the commitTime assigned
     * to the transaction.
     *
     * @throws InterruptedException
     *             if the barrier is reset while the caller is waiting.
     */
    @Override
    public long prepared(final long tx, final UUID dataService)
            throws IOException, InterruptedException, BrokenBarrierException {

        final DistributedTxCommitTask task = commitList.get(tx);

        if (task == null) {

            /*
             * Transaction is not committing.
             */
            throw new IllegalStateException();

        }

        final TxState state = task.state;

        state.lock.lock();

        try {

            if (!state.isStartedOn(dataService)) {

                throw new IllegalArgumentException();

            }

            // wait at the 'prepared' barrier.
            //
            // NOTE(review): this await occurs while holding [state.lock]; a
            // second participant invoking prepared() would block on that lock
            // before reaching the barrier. Verify against the TxState.lock
            // semantics (possible deadlock when nservices > 1).
            task.preparedBarrier.await();

            if (state.isAborted())
                throw new InterruptedException();

            return state.getCommitTime();

        } finally {

            state.lock.unlock();

        }

    }

    /**
     * Wait at "committed" barrier. When the barrier breaks, examine the
     * {@link TxState}. If the transaction is aborted, then return
     * <code>false</code>. Otherwise return true.
     * <p>
     * Note: The {@link TxState} will be aborted if any of the committers throws
     * an exception of their {@link ITxCommitProtocol#prepare(long, long)}
     * method.
     */
    @Override
    public boolean committed(final long tx, final UUID dataService)
            throws IOException, InterruptedException, BrokenBarrierException {

        final DistributedTxCommitTask task = commitList.get(tx);

        if (task == null) {

            /*
             * Transaction is not committing.
             */
            throw new IllegalStateException();

        }

        final TxState state = task.state;

        state.lock.lock();

        try {

            if (!state.isStartedOn(dataService)) {

                throw new IllegalArgumentException();

            }

            // wait at the 'committed' barrier.
            // NOTE(review): await while holding [state.lock] -- see the note
            // in prepared() above; confirm the same pattern is safe here.
            task.committedBarrier.await();

            if (state.isAborted())
                return false;

            return true;

        } finally {

            state.lock.unlock();

        }

    }

    // Finds the commit time LTE the given timestamp (see commitTimeIndex).
    protected long findCommitTime(final long timestamp) {

        synchronized (commitTimeIndex) {

            return commitTimeIndex.find(timestamp);

        }

    }

    // Finds the first commit time GT the given commitTime.
    protected long findNextCommitTime(long commitTime) {

        synchronized (commitTimeIndex) {

            return commitTimeIndex.findNext(commitTime);

        }

    }

    /**
     * @todo Is it a problem if the commit notices do not arrive in sequence?
     *       Because they will not. Unisolated operations will participate in
     *       group commits using timestamps obtained from the transaction
     *       service, but those commit operations will not be serialized and
     *       their reporting of the timestamps for the commits will likewise not
     *       be serialized.
     *       <p>
     *       The danger is that we could assign a read-historical transaction
     *       start time based on the {@link #commitTimeIndex} and then have
     *       commit timestamps arrive that are within the interval in which we
     *       made the assignment.
Essentially, our interval was too large and
     *       the assigned start time may have been on either side of a
     *       concurrent commit. However, this can only occur for unisolated
     *       operations (non-transactional commits). The selected timestamp will
     *       always be coherent with respect to transaction commits since those
     *       are coordinated and use a shared commit time.
     *       <p>
     *       This issue can only arise when requesting historical reads for
     *       timestamps that are "close" to the most recent commit point since
     *       the latency involved would otherwise not effect the assignment of
     *       transaction start times. However, it can occur either when
     *       specifying the symbolic constant {@link ITx#READ_COMMITTED} to
     *       {@link #newTx(long)} or when specifying the exact commitTime
     *       reported by a transaction commit.
     *       <p>
     *       Simply stated, there is NO protection against concurrent
     *       unisolated operations committing. If such operations are used on
     *       the same indices as transactions, then it IS possible that the
     *       application will be unable to read from exactly the post-commit
     *       state of the transaction for a brief period (10s of milliseconds)
     *       until the unisolated commit notices have been propagated to the
     *       {@link DistributedTransactionService}. This issue will only occur
     *       when there is also a lot of contention for reading on the desired
     *       timestamp since otherwise the commitTime itself may be used as a
     *       transaction start time.
     *
     * @todo depending on the latency involved and the issue described
     *       immediately above, it might be possible to simply queue these
     *       notices and consume them in an async thread. some operations (such
     *       as a distributed commit) might require that we catch up on the
     *       commit time notices in the queue. just thinking out loud here.
     */
    final public void notifyCommit(final long commitTime) {

        /*
         * Note: In order to avoid a deadlock, this must obtain the lock before
         * synchronizing on the commitTimeIndex since the method in the super
         * class will request that lock as well.
         */
        lock.lock();

        try {

            synchronized (commitTimeIndex) {

                /*
                 * Add all commit times.
                 */
                commitTimeIndex.add(commitTime);

                /*
                 * Note: commit time notifications can overlap such that they
                 * appear out of sequence with respect to their values. This is
                 * Ok. We just ignore any older commit times. However we do need
                 * to be synchronized here such that the commit time notices
                 * themselves are serialized so that we do not miss any.
                 */

                if (log.isDebugEnabled())
                    log.debug("commitTime=" + commitTime
                            + ", lastKnownCommitTime=" + lastCommitTime
                            + (lastCommitTime < commitTime ? " WILL UPDATE"
                                    : ""));

                if (lastCommitTime < commitTime) {

                    // Only advance; never move the last commit time backwards.
                    lastCommitTime = commitTime;

                    super.notifyCommit(commitTime);

                }

            }

        } finally {

            lock.unlock();

        }

    }

    // The most recent commit time reported to this service.
    final public long getLastCommitTime() {

        return lastCommitTime;

    }

    /**
     * Invokes {@link ITxCommitProtocol#setReleaseTime(long)} for a specific
     * {@link IDataService}.
     *
     * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
     */
    private static class SetReleaseTimeTask implements Callable<Void> {

        final IDataService dataService;

        final long releaseTime;

        public SetReleaseTimeTask(final IDataService dataService,
                final long releaseTime) {

            if (dataService == null)
                throw new IllegalArgumentException();

            if (releaseTime <= 0L)
                throw new IllegalArgumentException();

            this.dataService = dataService;

            this.releaseTime = releaseTime;

        }

        public Void call() throws Exception {

            dataService.setReleaseTime(releaseTime);

            return null;

        }

    }

    /**
     * Task periodically notifies the discovered {@link IDataService}s of the
     * new release time.
     * <p>
     * Note: Running a concurrent instance of this could cause release times to
     * be distributed that do not strictly advance. If you need to do this,
     * e.g., in order to immediately update the release time, then also
     * introduce a lock for this task on the {@link AbstractTransactionService}
     * so that instances of the task must run in sequence.
     *
     * @todo must also notify the metadata service once it is partitioned.
*
     * @todo We could monitor data service joins (for jini) and immediately
     *       notify newly joined data services of the current release time.
     *
     * FIXME There is probably no reason to do this now that the resource
     * manager reaches out for the current release time before deciding which
     * resources it can release.
     *
     * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
     */
    protected class NotifyReleaseTimeTask implements Runnable {

        // The release time most recently pushed to the data services.
        private long lastReleaseTime = 0L;

        /**
         * Notifies all {@link IDataService}s of the current release time.
         * <p>
         * Note: An {@link IDataService} WILL NOT release its most current
         * commit point, regardless of the releaseTime that is sent to that
         * service.
         * <p>
         * Note: If this method throws an exception then the task will no longer
         * be scheduled! (Hence the catch-all below.)
         */
        public void run() {

            try {

                final long releaseTime = getReleaseTime();

                if (releaseTime == lastReleaseTime) {

                    // The release time has not been advanced.
                    return;

                }

                final IBigdataFederation fed = getFederation();

                // All discovered data services.
                final UUID[] a = fed.getDataServiceUUIDs(0/* maxCount */);

                final IDataService[] services = getFederation()
                        .getDataServices(a);

                final List<Callable<Void>> tasks = new ArrayList<Callable<Void>>(
                        a.length);

                for (IDataService dataService : services) {

                    tasks.add(new SetReleaseTimeTask(dataService, releaseTime));

                }

                log.warn("Will set release time on " + a.length
                        + " data services: releaseTime=" + releaseTime);

                // Notify the services in parallel and wait for the outcomes.
                final List<Future<Void>> futures = getFederation()
                        .getExecutorService().invokeAll(tasks);

                for (Future<Void> f : futures) {

                    try {

                        // verify no errors.
                        f.get();

                    } catch (Throwable t) {

                        /*
                         * Log an error if any data service can not be notified.
                         */
                        log.error(t.getLocalizedMessage(), t);

                    }

                }

                // update the last release time.
                lastReleaseTime = releaseTime;

            } catch (Throwable t) {

                // Never propagate: would cancel the scheduled task.
                log.error(t.getLocalizedMessage(), t);

            }

        }

    }

    /**
     * Adds counters for the {@link LockManager}.
*/ // synchronized public CounterSet getCounters() { // if (countersRoot == null) { /* * Setup basic counters. */ final CounterSet countersRoot = super.getCounters(); /** * The lock manager imposing a partial ordering on the prepare phase * of distributed transaction commits using the index partition * names as the named resources. */ countersRoot.makePath("Index Lock Manager").attach( ((DistributedTransactionService) this).indexLockManager .getCounters()); /** * The lock manager imposing a partial ordering on the commit phase * of distributed transaction commits using the data service UUIDs * as the named resources. */ countersRoot.makePath("DataService Lock Manager").attach( ((DistributedTransactionService) this).dataServiceLockManager .getCounters()); /** * The #of snapshots of the commit time index that have been written * to date. */ countersRoot.addCounter("snapshotCount", new Instrument<Long>() { protected void sample() { setValue(snapshotCount); } }); /** * The #of distributed transaction commits that are currently in * progress. */ countersRoot.addCounter("distributedCommitsInProgressCount", new Instrument<Integer>() { protected void sample() { setValue(commitList.size()); } }); /** * The #of commit times that are currently accessible. */ countersRoot.addCounter("commitTimesCount", new Instrument<Long>() { protected void sample() { /* * Note: This uses a method which does not require * synchronization. (The entryCount is reported * without traversing the BTree.) */ setValue(commitTimeIndex.getEntryCount()); // synchronized (commitTimeIndex) { // setValue(commitTimeIndex.getEntryCount()); // } } }); countersRoot.addCounter("dataDir", new Instrument<String>() { protected void sample() { setValue(dataDir.toString()); } }); // } return countersRoot; } }