/** * Copyright (C) 2009-2015 FoundationDB, LLC * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package com.foundationdb.server.store; import com.foundationdb.ais.model.ForeignKey; import com.foundationdb.server.error.AkibanInternalException; import com.foundationdb.server.error.InvalidOperationException; import com.foundationdb.server.error.LockTimeoutException; import com.foundationdb.server.error.NoTransactionInProgressException; import com.foundationdb.server.error.QueryCanceledException; import com.foundationdb.server.error.TransactionAbortedException; import com.foundationdb.server.error.TransactionInProgressException; import com.foundationdb.server.service.session.Session; import com.foundationdb.server.service.session.Session.Key; import com.foundationdb.server.service.session.Session.StackKey; import com.foundationdb.server.service.transaction.TransactionService; import com.foundationdb.sql.parser.IsolationLevel; import com.foundationdb.util.MultipleCauseException; import com.foundationdb.util.Strings; import com.google.common.primitives.UnsignedBytes; import com.google.inject.Inject; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.Comparator; import java.util.Deque; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.ListIterator; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import java.util.TreeMap; import java.util.concurrent.Callable; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock; /** * KV storage (via TreeMap<byte[],byte[]) and transaction provider. * * Per-key locking providing repeatable read semantics. No gap locking. * * gets() take read (shared) locks, sets() and clears() take write (exclusive) locks * and shared can be upgraded to exclusive 'optimistically' via tag (StampedLock-ish). * * Uncommitted writes go directly into master KV after (exclusive) lock is acquired and an * undo log is maintained to restore proper state on rollback. * * Lock acquisition has an aggressive (1s) timeout after which a retryable error is thrown. */ public class MemoryTransactionService implements TransactionService { private static final Logger LOG = LoggerFactory.getLogger(MemoryTransactionService.class); private static final int PERIODIC_COMMIT_MILLS = 500; private static final int PERIODIC_COMMIT_BYTES = 100000; private static final int LOCK_TIMEOUT_MILLIS = 1 * 1000; private static final Key<MemoryTransactionImpl> TXN_KEY = Key.named("TXN"); private static final StackKey<Callback> PRE_COMMIT_KEY = StackKey.stackNamed("TXN_PRE_COMMIT"); private static final StackKey<Callback> AFTER_END_KEY = StackKey.stackNamed("TXN_AFTER_END"); private static final StackKey<Callback> AFTER_COMMIT_KEY = StackKey.stackNamed("TXN_AFTER_COMMIT"); private static final StackKey<Callback> AFTER_ROLLBACK_KEY = StackKey .stackNamed("TXN_AFTER_ROLLBACK"); private final ConcurrentMap<BytesHolder,TaggedLock> locks; private final KVMap db; @Inject public MemoryTransactionService() { this.locks = new ConcurrentHashMap<>(); this.db = new KVMap(); } // // Service // @Override public void start() { // None } @Override public void stop() { locks.clear(); synchronized(db) { db.clear(); } } @Override public void crash() { stop(); } // // TransactionService // @Override public boolean isTransactionActive(Session session) { return (session.get(TXN_KEY) != null); } @Override public boolean isRollbackPending(Session session) { return getTransactionInternal(session).isRollbackPending; } @Override public long getTransactionStartTimestamp(Session session) { return getTransactionInternal(session).startMillis; } @Override public void beginTransaction(Session session) { if(isTransactionActive(session)) { throw new TransactionInProgressException(); } MemoryTransactionImpl txn = new MemoryTransactionImpl(session); session.put(TXN_KEY, txn); } @Override public CloseableTransaction beginCloseableTransaction(final Session session) { beginTransaction(session); return new CloseableTransaction() { @Override public void commit() { commitTransaction(session); } @Override public void rollback() { rollbackTransaction(session); } @Override public void close() { rollbackTransactionIfOpen(session); } }; } @Override public void commitTransaction(Session session) { if(isRollbackPending(session)) { throw new TransactionAbortedException(); } commitInternal(session, false, true); } @Override public boolean commitOrRetryTransaction(Session session) { if(isRollbackPending(session)) { throw new TransactionAbortedException(); } return commitInternal(session, true, true); } @Override public void rollbackTransaction(Session session) { MemoryTransactionImpl txn = getTransactionInternal(session); RuntimeException re = null; try { rollbackInternal(session, txn); } catch(RuntimeException e) { re = e; } finally { end(session, txn, true, re); } } @Override public void rollbackTransactionIfOpen(Session session) { if(isTransactionActive(session)) { rollbackTransaction(session); } } @Override public boolean periodicallyCommit(Session session) { MemoryTransactionImpl txn = getTransactionInternal(session); if(txn.isTimeToCommit()) { commitInternal(session, false, false); txn.reset(); return true; } else { return false; } } @Override public boolean shouldPeriodicallyCommit(Session session) { return getTransactionInternal(session).isTimeToCommit(); } @Override public void addCallback(Session session, CallbackType type, Callback callback) { session.push(getCallbackKey(type), callback); } @Override public void addCallbackOnActive(Session session, CallbackType type, Callback callback) { if(!isTransactionActive(session)) { throw new IllegalStateException("Expected active"); } addCallback(session, type, callback); } @Override public void addCallbackOnInactive(Session session, CallbackType type, Callback callback) { if(isTransactionActive(session)) { throw new IllegalStateException("Expected inactive"); } addCallback(session, type, callback); } @Override public void run(Session session, final Runnable runnable) { run(session, new Callable<Void>() { @Override public Void call() throws Exception { runnable.run(); return null; } }); } @Override public <T> T run(Session session, Callable<T> callable) { for(int tries = 1; ; ++tries) { try { beginTransaction(session); T value = callable.call(); commitTransaction(session); return value; } catch(InvalidOperationException e) { if(!e.getCode().isRollbackClass()) { throw e; } // Back-off? LOG.debug("Retrying callable, attempt {}", tries); } catch(RuntimeException e) { throw e; } catch(Exception e) { throw new AkibanInternalException("Unexpected Exception", e); } finally { rollbackTransactionIfOpen(session); } } } @Override public void setSessionOption(Session session, SessionOption option, String value) { // None } @Override public int markForCheck(Session session) { return -1; } @Override public boolean checkSucceeded(Session session, Exception retryException, int sessionCounter) { return false; } @Override public void setDeferredForeignKey(Session session, ForeignKey foreignKey, boolean deferred) { MemoryTransactionImpl txn = getTransactionInternal(session); txn.deferredForeignKeys = ForeignKey.setDeferred(txn.deferredForeignKeys, foreignKey, deferred); } @Override public void checkStatementConstraints(Session session) { MemoryTransactionImpl txn = getTransactionInternal(session); if(txn.pendingChecks != null) { txn.pendingChecks.performChecks(session, txn, MemoryIndexChecks.CheckPass.STATEMENT); } } @Override public boolean getForceImmediateForeignKeyCheck(Session session) { // Only called via Online DDL throw new UnsupportedOperationException(); } @Override public boolean setForceImmediateForeignKeyCheck(Session session, boolean force) { // Only called via Online DDL throw new UnsupportedOperationException(); } @Override public IsolationLevel actualIsolationLevel(IsolationLevel level) { return IsolationLevel.REPEATABLE_READ_ISOLATION_LEVEL; } @Override public IsolationLevel setIsolationLevel(Session session, IsolationLevel level) { // Ignored. return IsolationLevel.REPEATABLE_READ_ISOLATION_LEVEL; } @Override public boolean isolationLevelRequiresReadOnly(Session session, boolean commitNow) { return false; } // // TreeMapTransactionImplService // public void addPendingCheck(Session session, MemoryIndexChecks.IndexCheck check) { MemoryTransactionImpl txn = getTransactionInternal(session); if(txn.pendingChecks == null) { txn.pendingChecks = new MemoryIndexChecks.PendingChecks(); } txn.pendingChecks.add(session, txn, check); } public MemoryTransaction getTransaction(Session session) { return getTransactionInternal(session); } public boolean isDeferred(Session session, ForeignKey foreignKey) { MemoryTransactionImpl txn = getTransactionInternal(session); return foreignKey.isDeferred(txn.deferredForeignKeys); } public void setRollbackPending(Session session) { MemoryTransactionImpl txn = getTransactionInternal(session); txn.isRollbackPending = true; } // // Static // /** Wrapper class providing equals() and hashCode() for byte[] */ private static class BytesHolder { public final byte[] bytes; private BytesHolder(byte[] bytes) { assert bytes != null; this.bytes = bytes; } @Override public boolean equals(Object o) { if(this == o) { return true; } if((o == null) || (getClass() != o.getClass())) { return false; } BytesHolder that = (BytesHolder)o; return Arrays.equals(bytes, that.bytes); } @Override public int hashCode() { return Arrays.hashCode(bytes); } } private static class CopiedEntry implements Entry<byte[], byte[]> { private final byte[] key; private final byte[] value; public CopiedEntry(Entry<byte[], byte[]> entry) { this.key = copy(entry.getKey()); this.value = copy(entry.getValue()); } @Override public byte[] getKey() { return key; } @Override public byte[] getValue() { return value; } @Override public byte[] setValue(byte[] value) { throw new UnsupportedOperationException(); } @Override public String toString() { return Strings.hex(key) + "=" + Strings.hex(value); } } private static class KVMap extends TreeMap<byte[], byte[]> { private static final Comparator<byte[]> COMPARATOR = UnsignedBytes.lexicographicalComparator(); private static final Comparator<byte[]> REVERSE_COMPARATOR = new Comparator<byte[]>() { @Override public int compare(byte[] o1, byte[] o2) { return COMPARATOR.compare(o2, o1); } }; public KVMap() { this(Collections.<byte[],byte[]>emptyMap(), false); } public KVMap(Map<byte[], byte[]> map, boolean reverse) { super(reverse ? REVERSE_COMPARATOR : COMPARATOR); putAll(map); } } private static class TaggedLock { private final ReadWriteLock rwLock; // ReadWriteLock does not support upgrading read to write. // Tag is checked pre-read.unlock() and must match post-write.lock(). private final AtomicLong tag; public TaggedLock() { this.rwLock = new ReentrantReadWriteLock(true); this.tag = new AtomicLong(0); } public void readUnlock() { rwLock.readLock().unlock(); } public void writeUnlock() { rwLock.writeLock().unlock(); } public void readLock(Session session) { tryLock(session, rwLock.readLock(), "shared"); } public void writeLock(Session session) { tryLock(session, rwLock.writeLock(), "exclusive"); tag.incrementAndGet(); } public void upgradeLock(Session session) { final long saveTag = tag.get(); readUnlock(); tryLock(session, rwLock.writeLock(), "upgrade"); if(saveTag != tag.get()) { writeUnlock(); throwTimeout(session, "optimistic upgrade"); } tag.incrementAndGet(); } private static void tryLock(Session session, Lock lock, String lockType) { try { if(!lock.tryLock(LOCK_TIMEOUT_MILLIS, TimeUnit.MILLISECONDS)) { throwTimeout(session, lockType); } } catch(InterruptedException e) { throw new QueryCanceledException(session); } } private static void throwTimeout(Session session, String lockType) { LOG.trace("lock timeout: {}", lockType); // No telling where this happened, e.g. row could be half written. getTransactionInternal(session).isRollbackPending = true; throw new LockTimeoutException(LOCK_TIMEOUT_MILLIS, lockType, MemoryTransactionService.class.getSimpleName()); } } private class MemoryTransactionImpl implements MemoryTransaction { final Session session; final List<UndoOp> undoLog; final Set<BytesHolder> readLocked; final Set<BytesHolder> writeLocked; long startMillis; long commitMillis; long bytesWritten; boolean isRollbackPending; Map<ForeignKey,Boolean> deferredForeignKeys; MemoryIndexChecks.PendingChecks pendingChecks; private MemoryTransactionImpl(Session session) { this.session = session; this.undoLog = new ArrayList<>(); this.readLocked = new HashSet<>(); this.writeLocked = new HashSet<>(); reset(); } private void readLock(byte[] rawKey) { BytesHolder key = new BytesHolder(rawKey); if(readLocked.contains(key) || writeLocked.contains(key)) { // Already locked return; } TaggedLock lock = locks.get(key); if(lock == null) { lock = new TaggedLock(); TaggedLock prevLock = locks.putIfAbsent(key, lock); if(prevLock != null) { lock = prevLock; } } lock.readLock(session); readLocked.add(key); } private void writeLock(byte[] rawKey) { BytesHolder key = new BytesHolder(rawKey); if(writeLocked.contains(key)) { // Already locked return; } TaggedLock lock = locks.get(key); if(lock == null) { lock = new TaggedLock(); TaggedLock prevLock = locks.putIfAbsent(key, lock); if(prevLock != null) { lock = prevLock; } } if(readLocked.contains(key)) { readLocked.remove(key); lock.upgradeLock(session); } else { lock.writeLock(session); } writeLocked.add(key); } public boolean isTimeToCommit() { if(bytesWritten > PERIODIC_COMMIT_BYTES) { return true; } if((System.currentTimeMillis() - startMillis) > PERIODIC_COMMIT_MILLS) { return true; } return false; } public void reset() { assert undoLog.isEmpty(); assert readLocked.isEmpty(); assert writeLocked.isEmpty(); startMillis = System.currentTimeMillis(); commitMillis = -1; bytesWritten = 0; isRollbackPending = false; if(deferredForeignKeys != null) { deferredForeignKeys.clear(); } if(pendingChecks != null) { pendingChecks.clear(); } } public void runUndo() { if(undoLog.isEmpty()) { return; } // Undo in reverse order than applied ListIterator<UndoOp> it = undoLog.listIterator(undoLog.size()); synchronized(db) { while(it.hasPrevious()) { UndoOp op = it.previous(); op.apply(db); } } undoLog.clear(); } public void runUnlock() { // Release all locks for(BytesHolder key : readLocked) { TaggedLock lock = locks.get(key); lock.readUnlock(); } readLocked.clear(); for(BytesHolder key : writeLocked) { TaggedLock lock = locks.get(key); lock.writeUnlock(); } writeLocked.clear(); } // // TreeMapTransaction // @Override public byte[] get(byte[] key) { readLock(key); byte[] value; synchronized(db) { value = db.get(key); } return copy(value); } @Override public byte[] getUncommitted(byte[] key) { // No lock byte[] value; synchronized(db) { value = db.get(key); } return copy(value); } @Override public Iterator<Entry<byte[], byte[]>> getRange(byte[] beginKey, final byte[] endKey) { return getRange(beginKey, endKey, false); } @Override public Iterator<Entry<byte[], byte[]>> getRange(byte[] beginKey, byte[] endKey, boolean reverse) { // Duplicate as some consumers want to iterate while calling set/clear final KVMap subMap; synchronized(db) { subMap = new KVMap(db.subMap(beginKey, endKey), reverse); } for(byte[] key : subMap.keySet()) { readLock(key); } final Iterator<Entry<byte[], byte[]>> it = subMap.entrySet().iterator(); return new Iterator<Entry<byte[], byte[]>>() { @Override public boolean hasNext() { return it.hasNext(); } @Override public Entry<byte[], byte[]> next() { Entry<byte[], byte[]> entry = it.next(); return new CopiedEntry(entry); } @Override public void remove() { throw new UnsupportedOperationException(); } }; } @Override public void set(byte[] key, byte[] value) { writeLock(key); byte[] k = copy(key); byte[] v = copy(value); byte[] prev; synchronized(db) { prev = db.put(k, v); } bytesWritten += key.length; bytesWritten += value.length; undoLog.add(new UndoOp(k, prev)); } @Override public void clear(byte[] key) { writeLock(key); byte[] prev; synchronized(db) { prev = db.remove(key); } bytesWritten += key.length; undoLog.add(new UndoOp(copy(key), prev)); } @Override public void clearRange(byte[] beginKey, byte[] endKey) { KVMap subMap; synchronized(db) { subMap = new KVMap(db.subMap(beginKey, endKey), false); } for(Entry<byte[], byte[]> entry : subMap.entrySet()) { writeLock(entry.getKey()); undoLog.add(new UndoOp(entry.getKey(), entry.getValue())); bytesWritten += entry.getKey().length; } synchronized(db) { for(byte[] key : subMap.keySet()) { db.remove(key); } } } } private static class UndoOp { private final byte[] key; private final byte[] value; private UndoOp(byte[] key, byte[] value) { this.key = key; this.value = value; } public void apply(KVMap map) { if(value == null) { map.remove(key); } else { map.put(key, value); } } @Override public String toString() { return "UndoOp(" + Strings.hex(key) + "=" + (value == null ? null : Strings.hex(value)) + ")"; } } private static void clearStack(Session session, Session.StackKey<Callback> key) { Deque<Callback> stack = session.get(key); if(stack != null) { stack.clear(); } } private static boolean commitInternal(Session session, boolean allowRetry, boolean clearState) { MemoryTransactionImpl txn = getTransactionInternal(session); boolean shouldRetry = false; RuntimeException re = null; try { if(txn.pendingChecks != null) { txn.pendingChecks.performChecks(session, txn, MemoryIndexChecks.CheckPass.TRANSACTION); } runCallbacks(session, PRE_COMMIT_KEY, txn.startMillis, null); // Not much to do, locks are cleared in end(). txn.undoLog.clear(); txn.commitMillis = System.currentTimeMillis(); runCallbacks(session, AFTER_COMMIT_KEY, txn.commitMillis, null); } catch(RuntimeException e1) { try { rollbackInternal(session, txn); // Only retryable exception from this store if(allowRetry && (e1 instanceof LockTimeoutException)) { clearState = false; shouldRetry = true; } else { re = e1; } } catch(RuntimeException e2) { re = e2; } } finally { end(session, txn, clearState, re); } return shouldRetry; } private static byte[] copy(byte[] bytes) { return (bytes == null) ? null : Arrays.copyOf(bytes, bytes.length); } private static void end(Session session, MemoryTransactionImpl txn, boolean clearState, RuntimeException cause) { RuntimeException re = cause; try { assert session.get(TXN_KEY) == txn; if(clearState) { session.remove(TXN_KEY); } txn.runUnlock(); } catch(RuntimeException e) { re = MultipleCauseException.combine(re, e); } finally { clearStack(session, PRE_COMMIT_KEY); clearStack(session, AFTER_COMMIT_KEY); clearStack(session, AFTER_ROLLBACK_KEY); runCallbacks(session, AFTER_END_KEY, -1, re); } } private static Session.StackKey<Callback> getCallbackKey(CallbackType type) { switch(type) { case PRE_COMMIT: return PRE_COMMIT_KEY; case COMMIT: return AFTER_COMMIT_KEY; case ROLLBACK: return AFTER_ROLLBACK_KEY; case END: return AFTER_END_KEY; } throw new IllegalArgumentException(String.valueOf(type)); } private static MemoryTransactionImpl getTransactionInternal(Session session) { MemoryTransactionImpl txn = session.get(TXN_KEY); if(txn == null) { throw new NoTransactionInProgressException(); } return txn; } private static void rollbackInternal(Session session, MemoryTransactionImpl txn) { txn.runUndo(); runCallbacks(session, AFTER_ROLLBACK_KEY, -1, null); } private static void runCallbacks(Session session, Session.StackKey<Callback> key, long timestamp, RuntimeException cause) { RuntimeException exceptions = cause; Callback cb; while((cb = session.pop(key)) != null) { try { cb.run(session, timestamp); } catch(RuntimeException e) { exceptions = MultipleCauseException.combine(exceptions, e); } } if(exceptions != null) { throw exceptions; } } }