/* $HeadURL:: $ * $Id$ * * Copyright (c) 2009-2010 DuraSpace * http://duraspace.org * * In collaboration with Topaz Inc. * http://www.topazproject.org * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.akubraproject.txn.derby; import java.io.IOException; import java.net.URI; import java.sql.Connection; import java.sql.ResultSet; import java.sql.SQLException; import java.sql.Statement; import java.sql.PreparedStatement; import java.util.Collections; import java.util.HashSet; import java.util.Iterator; import java.util.Map; import java.util.Set; import javax.sql.XAConnection; import javax.transaction.Transaction; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.derby.jdbc.EmbeddedXADataSource; import org.apache.derby.tools.sysinfo; import org.akubraproject.BlobStore; import org.akubraproject.BlobStoreConnection; import org.akubraproject.txn.AbstractTransactionalStore; /** * A simple transactional store using Derby db for the transaction logging and id mappings. It * provides snapshot isolation with fail-fast semantics, meaning it will immediately throw a * {@link org.akubraproject.txn.ConcurrentBlobUpdateException ConcurrentBlobUpdateException} * if a transaction tries to modify (insert, delete, or overwrite) a blob which was modified by * another transaction since the start of the first transaction (even if the change by the other * transaction hasn't been committed yet). 
The assumption is that rollbacks are rare and that it is
 * better to be notified of a conflict immediately rather than wasting time uploading large amounts
 * of data that will just have to be deleted again.
 *
 * <p>In general a transaction must be considered failed and should be rolled back after any
 * exception has occurred.
 *
 * <p>This store must be configured with exactly one underlying blob-store. It supports arbitrary
 * application-ids and maps them to the underlying blob-store's ids; it currently requires that
 * the underlying blob-store be capable of generating ids.
 *
 * <p>Snapshot isolation is implemented using an MVCC design as follows. A name-map holds a list of
 * versioned id mappings which maps application-ids to underlying store-ids; in addition, each
 * mapping has two flags indicating whether the mapping has been deleted and whether it has been
 * committed. When a transaction starts it is given a read version number (these increase
 * monotonically); only committed map entries with a version less than this read version or
 * uncommitted entries with a version the same as the read version will be read; if there are
 * multiple such entries for a given app-id, then the one with the highest version is used. If the
 * transaction makes a change (adding, removing, replacing, etc), a new entry is recorded in the
 * map with the version set to the read-version and with the committed flag set to false. On commit
 * the transaction is assigned a write version number (which is higher than any previously issued
 * read version numbers) and which it then sets on all entries written as part of this transaction;
 * it also sets the committed flag to true on these entries.
 *
 * <p>Old entries (and the underlying blobs) are cleaned out as they become unreferenced, i.e. when
 * no active transaction could refer to them anymore.
In order to speed up the discovery of such
 * entries, a separate deleted-list is kept into which an entry is made each time an entry in the
 * main map is marked as deleted and each time a blob is marked as deleted. This list is processed
 * at the end of every transaction and upon startup (on startup the list is completely cleared as
 * there are no active transactions).
 *
 * <p><em>A note on locking</em>: Derby, even in read-uncommitted mode, likes to acquire exclusive
 * locks on rows when doing inserts, deletes, and updates. This would be ok, except that it
 * sometimes attempts to lock rows it won't change. This can lead to deadlocks. The way around this
 * that I've found is to ensure Derby always uses an index when searching for the rows to update or
 * delete. This is accomplished by giving the optimizer explicit instructions via the
 * <var>DERBY-PROPERTIES</var> directive in the queries. Since this directive is only supported in
 * select statements, all updates and deletes are done via updatable queries (result-sets). This
 * actually performs about the same as a direct update or delete statement. See also the thread <a
 * href="http://mail-archives.apache.org/mod_mbox/db-derby-user/200903.mbox/%3c20090330092451.GD26813@innovation.ch%3e">disabling locking</a> (<a
 * href="http://mail-archives.apache.org/mod_mbox/db-derby-user/200904.mbox/%3c20090401001750.GB5281@innovation.ch%3e">continued</a>),
 * or at <a href="http://news.gmane.org/find-root.php?message_id=%3c20090330092451.GD26813%40innovation.ch%3e">gmane</a>.
 * Unfortunately, however, this does not seem to be sufficient: Derby may still lock other rows, as
 * documented in <a
 * href="http://db.apache.org/derby/docs/10.4/devguide/rdevconcepts8424.html">Scope of locks</a>
 * in Derby's developer guide. When this happens, the wait for the lock will eventually time out
 * and an exception will be thrown. However, I have not encountered this issue so far.
But a related
 * issue is present in 10.4 and earlier, namely <a
 * href="https://issues.apache.org/jira/browse/DERBY-2991">DERBY-2991</a>; testing with 10.5
 * indicates this issue has been resolved. For these reasons a flag is provided to restrict the
 * number of concurrent write-transactions to one, and the
 * {@link #TransactionalStore(URI, BlobStore, String) three-argument-constructor} will set this
 * single-writer flag to true for derby 10.4 and earlier.
 *
 * @author Ronald Tschalär
 */
public class TransactionalStore extends AbstractTransactionalStore {
  /** The SQL table used by this store to hold the name mappings */
  public static final String NAME_TABLE = "NAME_MAP";
  /** The SQL table used by this store to hold the list of deleted blobs */
  public static final String DEL_TABLE  = "DELETED_LIST";

  private static final Logger logger = LoggerFactory.getLogger(TransactionalStore.class);

  /** XA datasource for the embedded Derby db holding the name-map and deleted-list tables */
  private final EmbeddedXADataSource dataSource;
  /** read-versions of all currently-open transactions; guarded by 'this' */
  private final Set<Long> activeTxns = new HashSet<Long>();
  /** URIs currently locked via {@link #acquireUriLock}; guarded by itself */
  private final Set<URI> uriLocks = new HashSet<URI>();
  /** if true, serialize all writers (see class javadoc on Derby locking issues) */
  private final boolean singleWriter;
  /** the next read-version to hand out; guarded by 'this' */
  private long nextVersion;
  /** write-version reserved by the currently-preparing transaction, or -1 if none; guarded by 'this' */
  private long writeVersion = -1;
  /** read-version of the transaction holding the write lock, or -1 if free; guarded by 'this' */
  private long writeLockHolder = -1;
  /** true while a purge is running (at most one runs at a time); guarded by 'this' */
  private boolean purgeInProgress = false;
  /** number of purge requests skipped while one was in progress; guarded by 'this' */
  private int numPurgesDelayed = 0;
  /** whether the first connection has been opened yet (triggers the startup purge) */
  private boolean started = false;

  /**
   * Create a new transactional store. The single-writer flag will be determined automatically
   * depending on the version of derby being used.
   *
   * @param id           the id of this store
   * @param wrappedStore the wrapped non-transactional store
   * @param dbDir        the directory to use to store the transaction information
   * @throws IOException if there was an error initializing the db
   */
  public TransactionalStore(URI id, BlobStore wrappedStore, String dbDir) throws IOException {
    this(id, wrappedStore, dbDir, needSingleWriter());
  }

  /** Single-writer mode is needed for Derby releases before 10.5 (see DERBY-2991 in class javadoc). */
  private static boolean needSingleWriter() {
    return sysinfo.getMajorVersion() < 10 ||
           sysinfo.getMajorVersion() == 10 && sysinfo.getMinorVersion() < 5;
  }

  /**
   * Create a new transactional store.
   *
   * @param id           the id of this store
   * @param wrappedStore the wrapped non-transactional store
   * @param dbDir        the directory to use to store the transaction information
   * @param singleWriter if true, serialize all writers to avoid all locking issues with
   *                     Derby; if false, some transactions may fail sometimes due to
   *                     locks timing out
   * @throws IOException if there was an error initializing the db
   */
  public TransactionalStore(URI id, BlobStore wrappedStore, String dbDir, boolean singleWriter)
      throws IOException {
    super(id, wrappedStore);
    this.singleWriter = singleWriter;

    //TODO: redirect logging to logger
    //System.setProperty("derby.stream.error.logSeverityLevel", "50000");
    //System.setProperty("derby.stream.error.file", new File(base, "derby.log").toString());
    //System.setProperty("derby.language.logStatementText", "true");
    //System.setProperty("derby.stream.error.method", "java.sql.DriverManager.getLogStream");

    dataSource = new EmbeddedXADataSource();
    dataSource.setDatabaseName(dbDir);
    // auto-create the database on first use
    dataSource.setCreateDatabase("create");

    // Shut derby down cleanly on JVM exit; getting a connection after setShutdownDatabase
    // triggers the shutdown.
    // NOTE(review): Derby conventionally reports a successful shutdown via an SQLException,
    // so the warn below may also fire on a perfectly normal shutdown — confirm before tuning.
    Runtime.getRuntime().addShutdownHook(new Thread() {
      @Override
      public void run() {
        try {
          dataSource.setShutdownDatabase("shutdown");
          dataSource.getXAConnection().getConnection();
        } catch (Exception e) {
          logger.warn("Error shutting down derby", e);
        }
      }
    });

    createTables();
    // resume version numbering where the previous run left off
    nextVersion = findYoungestVersion() + 1;
    logger.info("TransactionalStore started: dbDir='" + dbDir + "', version=" + nextVersion);
  }

  /**
   * Create the name-map and deleted-list tables and their indexes, if they don't exist yet,
   * and set the db-wide lock tuning properties.
   *
   * @throws IOException if any of the SQL statements failed
   */
  private void createTables() throws IOException {
    runInCon(new Action<Void>() {
      public Void run(Connection con) throws SQLException {
        // test if table exists; if it does, assume all tables/indexes are already in place
        ResultSet rs = con.getMetaData().getTables(null, null, NAME_TABLE, null);
        try {
          if (rs.next())
            return null;
        } finally {
          rs.close();
        }

        // nope, so create it
        logger.info("Creating tables and indexes for name-map");

        Statement stmt = con.createStatement();
        try {
          stmt.execute("CREATE TABLE " + NAME_TABLE +
                       " (appId VARCHAR(1000) NOT NULL, storeId VARCHAR(1000) NOT NULL, " +
                       " version BIGINT NOT NULL, deleted SMALLINT, committed SMALLINT)");
          // indexes referenced by the DERBY-PROPERTIES hints elsewhere in this class
          stmt.execute("CREATE INDEX " + NAME_TABLE + "_AIIDX ON " + NAME_TABLE + "(appId)");
          stmt.execute("CREATE INDEX " + NAME_TABLE + "_VIDX ON " + NAME_TABLE + "(version)");
          stmt.execute("CREATE TABLE " + DEL_TABLE + " (appId VARCHAR(1000) NOT NULL, " +
                       " storeId VARCHAR(1000), version BIGINT NOT NULL)");
          stmt.execute("CREATE INDEX " + DEL_TABLE + "_VIDX ON " + DEL_TABLE + "(version)");

          // ensure Derby never uses table-locks, only row-locks
          stmt.execute(
              "CALL SYSCS_UTIL.SYSCS_SET_DATABASE_PROPERTY('derby.locks.escalationThreshold', '" +
              Integer.MAX_VALUE + "')");

          // we should really never be waiting for a lock let alone deadlock, but just in case
          stmt.execute(
              "CALL SYSCS_UTIL.SYSCS_SET_DATABASE_PROPERTY('derby.locks.deadlockTimeout', '30')");
        } finally {
          stmt.close();
        }

        return null;
      }
    }, "Failed to create tables");
  }

  /**
   * Find the highest version recorded in the name-map.
   *
   * @return the highest version, or -1 if the name-map is empty
   * @throws IOException if the query failed
   */
  private long findYoungestVersion() throws IOException {
    return runInCon(new Action<Long>() {
      public Long run(Connection con) throws SQLException {
        Statement stmt = con.createStatement();
        try {
          stmt.setMaxRows(1);       // only the first (highest) row is needed
          ResultSet rs =            // NOPMD
              stmt.executeQuery("SELECT version FROM " + NAME_TABLE + " ORDER BY version DESC");
          return rs.next() ? rs.getLong(1) : -1L;
        } finally {
          stmt.close();             // also closes rs
        }
      }
    }, "Failed to find youngest version");
  }

  /**
   * Open a connection: hand out the next read-version, register it as active, open an XA
   * connection to the db and enlist it in the given transaction.
   *
   * @throws IllegalStateException if no backing store has been set yet
   */
  @Override
  public BlobStoreConnection openConnection(Transaction tx, Map<String, String> hints)
      throws IllegalStateException, IOException {
    long version;
    synchronized (this) {
      // on the very first connection, clear out leftovers from a previous run
      if (!started) {
        started = true;
        purgeOldVersions(0);
      }

      // all versions up to writeVersion are reserved by the currently-preparing transaction,
      // so wait until the gap is re-opened (see txnPrepare/txnComplete)
      while (writeVersion >= 0 && nextVersion == writeVersion) {
        if (logger.isDebugEnabled())
          logger.debug("Out of available versions - waiting for write-lock to be released");
        try {
          wait();
        } catch (InterruptedException ie) {
          // NOTE(review): the interrupt flag is not restored here — callers only see the IOException
          throw new IOException("wait for write-lock interrupted", ie);
        }
      }

      version = nextVersion++;
      boolean isNew = activeTxns.add(version);
      assert isNew : "duplicate version " + version;
    }

    boolean ok = false;
    try {
      XAConnection xaCon;
      Connection con;
      synchronized (dataSource) {
        xaCon = dataSource.getXAConnection();
        con = xaCon.getConnection();
      }
      // read-uncommitted: snapshot isolation is implemented by the version filtering, not by Derby
      con.setTransactionIsolation(Connection.TRANSACTION_READ_UNCOMMITTED);

      tx.enlistResource(xaCon.getXAResource());

      BlobStoreConnection bsc =
          new TransactionalConnection(this, wrappedStore, xaCon, con, tx, hints, version);

      if (logger.isDebugEnabled())
        logger.debug("Opened connection, read-version=" + version);

      ok = true;
      return bsc;
    } catch (IOException ioe) {
      throw ioe;
    } catch (Exception e) {
      throw new IOException("Error connecting to db", e);
    } finally {
      // on any failure, de-register the version so purging is not held up by a dead txn
      if (!ok) {
        synchronized (this) {
          activeTxns.remove(version);
        }
      }
    }
  }

  /** @return whether writers are serialized (see class javadoc) */
  boolean singleWriter() {
    return singleWriter;
  }

  /**
   * Acquire the write lock. This is a simple, re-entrant lock without a lock count. If the lock
   * is already held this will block until it is free.
* * @param version the version acquiring the lock * @throws InterruptedException if waiting for the lock was interrupted */ synchronized void acquireWriteLock(long version) throws InterruptedException { while (writeLockHolder >= 0 && writeLockHolder != version) wait(); if (logger.isTraceEnabled()) logger.trace("Transaction " + version + " acquired write lock"); writeLockHolder = version; } /** * Release the write lock. This always completely releases lock no matter how often {@link * #acquireWriteLock} was invoked. * * @param version the version that acquired the lock * @throws IllegalStateException if the lock is not held by <var>version</var> */ synchronized void releaseWriteLock(long version) { if (writeLockHolder != version) throw new IllegalStateException("Connection '" + version + "' is not the holder of the " + "write lock; '" + writeLockHolder + "' is"); if (logger.isTraceEnabled()) logger.trace("Transaction " + version + " released write lock"); writeLockHolder = -1; notifyAll(); } /** * Acquire a lock on the given URI. Each lock for each URI is a simple, non-reentrant lock and * each lock for each URI is independent of the others. If the lock is already held this will * block until it is free. * * @param uri the URI for which to acquire the lock * @throws InterruptedException if waiting for the lock was interrupted */ void acquireUriLock(URI uri) throws InterruptedException { synchronized (uriLocks) { while (uriLocks.contains(uri)) uriLocks.wait(); uriLocks.add(uri); } } /** * Release the lock on the given URI. * * @param uri the URI for which to release the lock * @throws IllegalStateException if the lock was not held */ void releaseUriLock(URI uri) throws IllegalStateException { synchronized (uriLocks) { if (!uriLocks.remove(uri)) throw new IllegalStateException("Uri lock for <" + uri + "> was not held"); uriLocks.notifyAll(); } } /** * Prepare the transaction. 
This acquires the write-lock and hence must always be followed by * {@link #txnComplete} to release it. * * @param numMods the number of modifications made during this transaction; this is used * to estimate how long the commit might take * @param version the transaction's read-version - used for logging * @return the write version * @throws InterruptedException if interrupted while waiting for the write-lock */ synchronized long txnPrepare(int numMods, long version) throws InterruptedException { if (logger.isDebugEnabled()) logger.debug("Preparing transaction " + version); acquireWriteLock(version); /* Leave a little space in the version number sequence so other transactions may start while * this one completes. The constant '1/100' is pulled out of thin air, and represents a guess * on the upper bound on how many transactions are likely to be started during the time it * takes this one to complete; if it is too large then we just have larger holes and the * transaction numbers jump more than necessary, which isn't tragic as long as the jumps are * not so large that we run into a real possibility of version number wrap-around; if it is too * small then that just means transactions may be needlessly held up waiting for this one to * complete. Also, we always leave a little extra room to account for the fact that there's a * semi-fixed overhead that a commit will take even if there are only a few changes. */ writeVersion = Math.max(nextVersion + numMods / 100, 10); if (logger.isDebugEnabled()) logger.debug("Prepared transaction " + version + ", write-version=" + writeVersion); return writeVersion; } /** * Signal that the transaction is complete. This must always be invoked. * * @param committed whether the transaction was committed or rolled back * @param version the transaction's read-version */ synchronized void txnComplete(boolean committed, long version) { if (logger.isDebugEnabled()) logger.debug("Transaction " + version + " completed " + (committed ? 
"(committed)" : "(rolled back)")); boolean wasActive = activeTxns.remove(version); assert wasActive : "completed unknown transaction " + version + (committed ? "(committed)" : "(rolled back)"); if (writeLockHolder != version) return; // never prepared (e.g. r/o txn, or rollback) if (committed && writeVersion >= 0) nextVersion = writeVersion + 1; writeVersion = -1; releaseWriteLock(version); } /** * Purge all old versions that are not being used anymore. * * @param lastCompletedVersion the version of the recently completed transaction; if there are * other, older transactions still active then the purge can be * avoided, i.e. this is just for optimization. */ void purgeOldVersions(long lastCompletedVersion) { final long minVers; synchronized (this) { minVers = activeTxns.isEmpty() ? nextVersion : Collections.min(activeTxns); if (minVers < lastCompletedVersion) return; // we didn't release anything /* Derby has issues trying to run multiple purges in parallel (NPE's, waiting for * locks that should not be held by anybody, and even deadlocks). Also, there isn't * that much point in running multiple purges simultaneously, as the next purge * will clean up stuff too. * * However, just short-circuiting here if a purge is already in progress can cause * the purging to fall seriously behind under load (in a sort of negative feedback * loop: the more it falls behind, the longer it takes to catch up, the more it * falls behind, ...). Hence we keep track of how many times we've skipped the * purge and after some threshhold we start blocking to let the purge catch up. 
*/ while (purgeInProgress) { if (numPurgesDelayed < 10) { numPurgesDelayed++; return; } try { wait(); } catch (InterruptedException ie) { throw new RuntimeException("Interrupted waiting for purge lock", ie); } } purgeInProgress = true; numPurgesDelayed = 0; } try { if (singleWriter) acquireWriteLock(lastCompletedVersion); runInCon(new Action<Void>() { public Void run(Connection con) throws SQLException { if (logger.isDebugEnabled()) logger.debug("Purging deleted blobs older than revision " + minVers); // clean out stale mapping entries PreparedStatement findOld = con.prepareStatement( "SELECT appId, version FROM " + DEL_TABLE + " WHERE version < ?"); findOld.setLong(1, minVers); ResultSet rs = findOld.executeQuery(); // NOPMD int cntM = 0; try { if (!rs.next()) return null; PreparedStatement purge = con.prepareStatement( "SELECT version FROM " + NAME_TABLE + " -- DERBY-PROPERTIES index=NAME_MAP_AIIDX \n" + " WHERE appId = ? AND (version < ? OR version = ? AND deleted <> 0)", ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_UPDATABLE); do { purge.setString(1, rs.getString(1)); purge.setLong(2, rs.getLong(2)); purge.setLong(3, rs.getLong(2)); ResultSet rs2 = purge.executeQuery(); // NOPMD try { while (rs2.next()) { cntM++; rs2.deleteRow(); } } finally { rs2.close(); } } while (rs.next()); purge.close(); } finally { try { rs.close(); } finally { findOld.close(); } } // remove unreferenced blobs findOld = con.prepareStatement( "SELECT storeId FROM " + DEL_TABLE + " WHERE version < ? 
AND storeId IS NOT NULL"); findOld.setLong(1, minVers); rs = findOld.executeQuery(); int cntB = 0; try { BlobStoreConnection bsc = wrappedStore.openConnection(null, null); try { while (rs.next()) { cntB++; String storeId = rs.getString(1); if (logger.isTraceEnabled()) logger.trace("Purging deleted blob '" + storeId + "'"); try { bsc.getBlob(URI.create(storeId), null).delete(); } catch (IOException ioe) { logger.warn("Error purging blob '" + storeId + "'", ioe); } } } finally { bsc.close(); } } catch (IOException ioe) { logger.warn("Error opening connection to underlying store to purge old versions", ioe); } finally { try { rs.close(); } finally { findOld.close(); } } // purge processed entries from the delete table String sql = "SELECT version FROM " + DEL_TABLE + " -- DERBY-PROPERTIES index=DELETED_LIST_VIDX \n WHERE version < ?"; PreparedStatement purge = con.prepareStatement(sql, ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_UPDATABLE); purge.setLong(1, minVers); rs = purge.executeQuery(); int cntD = 0; try { while (rs.next()) { cntD++; rs.deleteRow(); } } finally { try { rs.close(); } finally { purge.close(); } } // debug log the stats try { int cntL = 0; if (logger.isTraceEnabled()) { BlobStoreConnection bsc = wrappedStore.openConnection(null, null); for (Iterator<URI> iter = bsc.listBlobIds(null); iter.hasNext(); iter.next()) cntL++; bsc.close(); } if (logger.isDebugEnabled()) logger.debug("purged: " + cntM + " mappings, " + cntB + " blobs, " + cntD + " deletes" + (logger.isTraceEnabled() ? 
"; " + cntL + " blobs left" : "")); } catch (Exception e) { e.printStackTrace(); } return null; } }, "Error purging old versions"); } catch (Exception e) { logger.warn("Error purging old versions", e); } finally { try { if (singleWriter) releaseWriteLock(lastCompletedVersion); } finally { synchronized (this) { purgeInProgress = false; notifyAll(); } } } } private <T> T runInCon(Action<T> action, String errMsg) throws IOException { try { XAConnection xaCon; Connection con; synchronized (dataSource) { xaCon = dataSource.getXAConnection(); con = xaCon.getConnection(); } con.setTransactionIsolation(Connection.TRANSACTION_READ_UNCOMMITTED); con.setAutoCommit(false); boolean committed = false; try { T res = action.run(con); con.commit(); committed = true; return res; } finally { if (!committed) { try { con.rollback(); } catch (SQLException sqle) { logger.error("Error rolling back after failure", sqle); } } xaCon.close(); } } catch (SQLException sqle) { throw new IOException(errMsg, sqle); } } private static interface Action<T> { public T run(Connection con) throws SQLException; } }