// Copyright 2017 JanusGraph Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package org.janusgraph.diskstorage.locking.consistentkey;
import com.google.common.base.Function;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import org.janusgraph.core.JanusGraphConfigurationException;
import org.janusgraph.diskstorage.configuration.ConfigElement;
import org.janusgraph.diskstorage.util.time.Timer;
import org.janusgraph.diskstorage.util.time.TimestampProvider;
import org.janusgraph.diskstorage.*;
import org.janusgraph.diskstorage.configuration.Configuration;
import org.janusgraph.diskstorage.keycolumnvalue.*;
import org.janusgraph.diskstorage.locking.*;
import org.janusgraph.diskstorage.util.*;
import org.janusgraph.graphdb.configuration.GraphDatabaseConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.time.Duration;
import java.time.Instant;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
* A global {@link Locker} that resolves inter-thread lock contention via
* {@link AbstractLocker} and resolves inter-process contention by reading and
* writing lock data using {@link KeyColumnValueStore}.
* <p/>
* <h2>Protocol and internals</h2>
* <p/>
* Locking is done in two stages: first between threads inside a shared process,
* and then between processes in a JanusGraph cluster.
* <p/>
* <h3>Inter-thread lock contention</h3>
* <p/>
* Lock contention between transactions within a shared process is arbitrated by
* the {@code LocalLockMediator} class. This mediator uses standard
* {@code java.util.concurrent} classes to guarantee that at most one thread
* holds a lock on any given {@link KeyColumn} at any given time. The code that
* uses a mediator to resolve inter-thread lock contention is common to multiple
* {@code Locker} implementations and lives in the abstract base class
* {@link AbstractLocker}.
* <p/>
* However, the mediator has no way to perform inter-process communication. The
* mediator can't detect or prevent a thread in another process (potentially on
* different machine) acquiring the same lock. This is addressed in the next
* section.
* <p/>
* <h3>Inter-process lock contention</h3>
* <p/>
* After the mediator signals that the current transaction has obtained a lock
* at the inter-thread/intra-process level, this implementation does the
* following series of writes and reads to {@code KeyColumnValueStore} to check
* whether it is the only process that holds the lock. These Cassandra
* operations go to a dedicated store holding nothing but locking data (a
* "store" in this context means a Cassandra column family, an HBase table,
* etc.)
* <p/>
* <h4>Locking I/O sequence</h4>
* <p/>
* <ol>
* <li>Write a single column to the store with the following data
* <dl>
* <dt>key</dt>
* <dd>{@link KeyColumn#getKey()} followed by {@link KeyColumn#getColumn()}.</dd>
* <dt>column</dt>
* <dd>the approximate current timestamp in nanoseconds followed by this
* process's {@code rid} (an opaque identifier which uniquely identifie
* this process either globally or at least within the JanusGraph cluster)</dd>
* <dt>value</dt>
* <dd>the single byte 0; this is unused but reserved for future use</dd>
* </dl>
* </li>
* <p/>
* <li>If the write failed or took longer than {@code lockWait} to complete
* successfully, then retry the write with an updated timestamp and everything
* else the same until we either exceed the configured retry count (in which
* case we abort the lock attempt) or successfully complete the write in less
* than {@code lockWait}.</li>
* <p/>
* <li>Wait, if necessary, until the time interval {@code lockWait} has passed
* between the timestamp on our successful write and the current time.</li>
* <p/>
* <li>Read all columns for the key we wrote in the first step.</li>
* <p/>
* <li>Discard any columns with timestamps older than {@code lockExpire}.</li>
* <p/>
* <li>If our column is either the first column read or is preceeded only by
* columns containing our own {@code rid}, then we hold the lock. Otherwise,
* another process holds the lock and we have failed to acquire it.</li>
* <p/>
* <li>To release the lock, we delete from the store the column that we
* wrote earlier in this sequence</li>
* </ol>
* <p/>
* <p/>
* As mentioned earlier, this class relies on {@link AbstractLocker} to obtain
* and release an intra-process lock before and after the sequence of steps
* listed above. The mediator step is necessary for thread-safety, because
* {@code rid} is only unique at the process level. Without a mediator, distinct
* threads could write lock columns with the same {@code rid} and be unable to
* tell their lock claims apart.
*/
public class ConsistentKeyLocker extends AbstractLocker<ConsistentKeyLockStatus> implements Locker {
/**
* Storage backend for locking records.
*/
private final KeyColumnValueStore store;
private final StoreManager manager;
/**
* This has units of {@code times.getUnit()}.
*/
private final Duration lockWait;
private final int lockRetryCount;
/**
* Expired lock cleaner in charge of {@link #store}.
*/
private final LockCleanerService cleanerService;
private static final StaticBuffer zeroBuf = BufferUtil.getIntBuffer(0); // TODO this does not belong here
/*
* In the storage backends, columns composed of one or more occurrences
* of a single byte sort from shortest to longest.
*
* A lock column is 9 or more bytes long:
*
* -------------------------------------
* | 8 bytes timestamp | var bytes rid |
* -------------------------------------
*
* A start bound of a single zero byte will always sort before the
* smallest timestamp due to length.
*
* The end bound is not as obvious. A timestamp with all-one-bits
* is eons away in the default configuration. However, it's theoretically
* possible with the nanos provider, since it relies on System.nanoTime,
* and the contract for nanoTime explicitly says that it may go negative.
*
* Fortunately, we can rely on the rid for our end bound. The rid is a
* user-customizable string that gets converted into a StaticBuffer via
* String.getBytes. This should UTF-8 encode it. Under UTF-8, a byte
* with all bits set is illegal. So, the 9th byte of a lock column
* should never have all bits set. This is why LOCK_END_COL is exactly
* 9 bytes long: it's just enough to extend past the timestamp and onto
* the first byte of a UTF-8 string, and that UTF-8 byte should have at
* least one zero bit that makes it sort before.
*/
public static final StaticBuffer LOCK_COL_START = BufferUtil.zeroBuffer(1);
public static final StaticBuffer LOCK_COL_END = BufferUtil.oneBuffer(9);
private static final Logger log = LoggerFactory.getLogger(ConsistentKeyLocker.class);
public static class Builder extends AbstractLocker.Builder<ConsistentKeyLockStatus, Builder> {
// Required (no default)
private final KeyColumnValueStore store;
private final StoreManager manager;
// Optional (has default)
private Duration lockWait;
private int lockRetryCount;
private enum CleanerConfig {
NONE,
STANDARD,
CUSTOM
};
private CleanerConfig cleanerConfig = CleanerConfig.NONE;
private LockCleanerService customCleanerService;
public Builder(KeyColumnValueStore store, StoreManager manager) {
this.store = store;
this.manager = manager;
this.lockWait = GraphDatabaseConfiguration.LOCK_WAIT.getDefaultValue();
this.lockRetryCount = GraphDatabaseConfiguration.LOCK_RETRY.getDefaultValue();
}
public Builder lockWait(Duration d) {
this.lockWait = d;
return self();
}
public Builder lockRetryCount(int count) {
this.lockRetryCount = count;
return self();
}
public Builder standardCleaner() {
this.cleanerConfig = CleanerConfig.STANDARD;
this.customCleanerService = null;
return self();
}
public Builder customCleaner(LockCleanerService s) {
this.cleanerConfig = CleanerConfig.CUSTOM;
this.customCleanerService = s;
Preconditions.checkNotNull(this.customCleanerService);
return self();
}
public Builder fromConfig(Configuration config) {
rid(new StaticArrayBuffer(config.get(GraphDatabaseConfiguration.UNIQUE_INSTANCE_ID).getBytes()));
final String llmPrefix = config.get(GraphDatabaseConfiguration.LOCK_LOCAL_MEDIATOR_GROUP);
times(config.get(GraphDatabaseConfiguration.TIMESTAMP_PROVIDER));
mediator(LocalLockMediators.INSTANCE.<StoreTransaction>get(llmPrefix, times));
lockRetryCount(config.get(GraphDatabaseConfiguration.LOCK_RETRY));
lockWait(config.get(GraphDatabaseConfiguration.LOCK_WAIT));
lockExpire(config.get(GraphDatabaseConfiguration.LOCK_EXPIRE));
if (config.get(GraphDatabaseConfiguration.LOCK_CLEAN_EXPIRED)) {
standardCleaner();
}
return this;
}
public ConsistentKeyLocker build() {
preBuild();
final LockCleanerService cleaner;
switch (cleanerConfig) {
case STANDARD:
Preconditions.checkArgument(null == customCleanerService);
cleaner = new StandardLockCleanerService(store, serializer, times);
break;
case CUSTOM:
Preconditions.checkArgument(null != customCleanerService);
cleaner = customCleanerService;
break;
default:
cleaner = null;
}
return new ConsistentKeyLocker(store, manager, rid, times,
serializer, llm,
lockWait,
lockRetryCount,
lockExpire,
lockState, cleaner);
}
@Override
protected Builder self() {
return this;
}
@Override
protected LocalLockMediator<StoreTransaction> getDefaultMediator() {
throw new JanusGraphConfigurationException("Local lock mediator prefix must not be empty or null");
}
}
/**
* Create a new locker.
*
*/
private ConsistentKeyLocker(KeyColumnValueStore store, StoreManager manager, StaticBuffer rid,
TimestampProvider times, ConsistentKeyLockerSerializer serializer,
LocalLockMediator<StoreTransaction> llm, Duration lockWait,
int lockRetryCount, Duration lockExpire,
LockerState<ConsistentKeyLockStatus> lockState,
LockCleanerService cleanerService) {
super(rid, times, serializer, llm, lockState, lockExpire, log);
this.store = store;
this.manager = manager;
this.lockWait = lockWait;
this.lockRetryCount = lockRetryCount;
this.cleanerService = cleanerService;
}
/**
* Try to write a lock record remotely up to the configured number of
* times. If the store produces
* {@link TemporaryLockingException}, then we'll call mutate again to add a
* new column with an updated timestamp and to delete the column that tried
* to write when the store threw an exception. We continue like that up to
* the retry limit. If the store throws anything else, such as an unchecked
* exception or a {@link org.janusgraph.diskstorage.PermanentBackendException}, then we'll try to
* delete whatever we added and return without further retries.
*
* @param lockID lock to acquire
* @param txh transaction
* @return the timestamp, in nanoseconds since UNIX Epoch, on the lock
* column that we successfully wrote to the store
* @throws TemporaryLockingException if the lock retry count is exceeded without successfully
* writing the lock in less than the wait limit
* @throws Throwable if the storage layer throws anything else
*/
@Override
protected ConsistentKeyLockStatus writeSingleLock(KeyColumn lockID, StoreTransaction txh) throws Throwable {
final StaticBuffer lockKey = serializer.toLockKey(lockID.getKey(), lockID.getColumn());
StaticBuffer oldLockCol = null;
for (int i = 0; i < lockRetryCount; i++) {
WriteResult wr = tryWriteLockOnce(lockKey, oldLockCol, txh);
if (wr.isSuccessful() && wr.getDuration().compareTo(lockWait) <= 0) {
final Instant writeInstant = wr.getWriteTimestamp();
final Instant expireInstant = writeInstant.plus(lockExpire);
return new ConsistentKeyLockStatus(writeInstant, expireInstant);
}
oldLockCol = wr.getLockCol();
handleMutationFailure(lockID, lockKey, wr, txh);
}
tryDeleteLockOnce(lockKey, oldLockCol, txh);
// TODO log exception or successful too-slow write here
throw new TemporaryBackendException("Lock write retry count exceeded");
}
/**
* Log a message and/or throw an exception in response to a lock write
* mutation that failed. "Failed" means that the mutation either succeeded
* but took longer to complete than configured lock wait time, or that
* the call to mutate threw something.
*
* @param lockID coordinates identifying the lock we tried but failed to
* acquire
* @param lockKey the byte value of the key that we mutated or attempted to
* mutate in the lock store
* @param wr result of the mutation
* @param txh transaction attempting the lock
* @throws Throwable if {@link WriteResult#getThrowable()} is not an instance of
* {@link org.janusgraph.diskstorage.TemporaryBackendException}
*/
private void handleMutationFailure(KeyColumn lockID, StaticBuffer lockKey, WriteResult wr, StoreTransaction txh) throws Throwable {
Throwable error = wr.getThrowable();
if (null != error) {
if (error instanceof TemporaryBackendException) {
// Log error and continue the loop
log.warn("Temporary exception during lock write", error);
} else {
/*
* A PermanentStorageException or an unchecked exception. Try to
* delete any previous writes and then die. Do not retry even if
* we have retries left.
*/
log.error("Fatal exception encountered during attempted lock write", error);
WriteResult dwr = tryDeleteLockOnce(lockKey, wr.getLockCol(), txh);
if (!dwr.isSuccessful()) {
log.warn("Failed to delete lock write: abandoning potentially-unreleased lock on " + lockID, dwr.getThrowable());
}
throw error;
}
} else {
log.warn("Lock write succeeded but took too long: duration {} exceeded limit {}", wr.getDuration(), lockWait);
}
}
private WriteResult tryWriteLockOnce(StaticBuffer key, StaticBuffer del, StoreTransaction txh) {
Throwable t = null;
final Timer writeTimer = times.getTimer().start();
StaticBuffer newLockCol = serializer.toLockCol(writeTimer.getStartTime(), rid, times);
Entry newLockEntry = StaticArrayEntry.of(newLockCol, zeroBuf);
try {
StoreTransaction newTx = overrideTimestamp(txh, writeTimer.getStartTime());
store.mutate(key, Arrays.asList(newLockEntry), null == del ? KeyColumnValueStore.NO_DELETIONS : Arrays.asList(del), newTx);
} catch (BackendException e) {
log.debug("Lock write attempt failed with exception", e);
t = e;
}
writeTimer.stop();
return new WriteResult(writeTimer.elapsed(), writeTimer.getStartTime(), newLockCol, t);
}
private WriteResult tryDeleteLockOnce(StaticBuffer key, StaticBuffer col, StoreTransaction txh) {
Throwable t = null;
final Timer delTimer = times.getTimer().start();
try {
StoreTransaction newTx = overrideTimestamp(txh, delTimer.getStartTime());
store.mutate(key, ImmutableList.<Entry>of(), Arrays.asList(col), newTx);
} catch (BackendException e) {
t = e;
}
delTimer.stop();
return new WriteResult(delTimer.elapsed(), delTimer.getStartTime(), null, t);
}
@Override
protected void checkSingleLock(final KeyColumn kc, final ConsistentKeyLockStatus ls, final StoreTransaction tx) throws BackendException, InterruptedException {
if (ls.isChecked())
return;
// Sleep, if necessary
// We could be smarter about sleeping by iterating oldest -> latest...
final Instant now = times.sleepPast(ls.getWriteTimestamp().plus(lockWait));
// Slice the store
KeySliceQuery ksq = new KeySliceQuery(serializer.toLockKey(kc.getKey(), kc.getColumn()), LOCK_COL_START, LOCK_COL_END);
List<Entry> claimEntries = getSliceWithRetries(ksq, tx);
// Extract timestamp and rid from the column in each returned Entry...
Iterable<TimestampRid> iter = Iterables.transform(claimEntries, new Function<Entry, TimestampRid>() {
@Override
public TimestampRid apply(Entry e) {
return serializer.fromLockColumn(e.getColumnAs(StaticBuffer.STATIC_FACTORY), times);
}
});
// ...and then filter out the TimestampRid objects with expired timestamps
// (This doesn't use Iterables.filter and Predicate so that we can throw a checked exception if necessary)
ArrayList<TimestampRid> unexpiredTRs = new ArrayList<TimestampRid>(Iterables.size(iter));
for (TimestampRid tr : iter) {
final Instant cutoffTime = now.minus(lockExpire);
if (tr.getTimestamp().isBefore(cutoffTime)) {
log.warn("Discarded expired claim on {} with timestamp {}", kc, tr.getTimestamp());
if (null != cleanerService)
cleanerService.clean(kc, cutoffTime, tx);
// Locks that this instance wrote that have now expired should not only log but also throw a descriptive exception
if (rid.equals(tr.getRid()) && ls.getWriteTimestamp().equals(tr.getTimestamp())) {
throw new ExpiredLockException("Expired lock on " + kc.toString() +
": lock timestamp " + tr.getTimestamp() + " " + times.getUnit() + " is older than " +
ConfigElement.getPath(GraphDatabaseConfiguration.LOCK_EXPIRE) + "=" + lockExpire);
// Really shouldn't refer to GDC.LOCK_EXPIRE here, but this will typically be accurate in a real use case
}
continue;
}
unexpiredTRs.add(tr);
}
checkSeniority(kc, ls, unexpiredTRs);
ls.setChecked();
}
private List<Entry> getSliceWithRetries(KeySliceQuery ksq, StoreTransaction tx) throws BackendException {
for (int i = 0; i < lockRetryCount; i++) {
// TODO either make this like writeLock so that it handles all Throwable types (and pull that logic out into a shared method) or make writeLock like this in that it only handles Temporary/PermanentSE
try {
return store.getSlice(ksq, tx);
} catch (PermanentBackendException e) {
log.error("Failed to check locks", e);
throw new PermanentLockingException(e);
} catch (TemporaryBackendException e) {
log.warn("Temporary storage failure while checking locks", e);
}
}
throw new TemporaryBackendException("Maximum retries (" + lockRetryCount + ") exceeded while checking locks");
}
private void checkSeniority(KeyColumn target, ConsistentKeyLockStatus ls, Iterable<TimestampRid> claimTRs) throws BackendException {
int trCount = 0;
for (TimestampRid tr : claimTRs) {
trCount++;
if (!rid.equals(tr.getRid())) {
final String msg = "Lock on " + target + " already held by " + tr.getRid() + " (we are " + rid + ")";
log.debug(msg);
throw new TemporaryLockingException(msg);
}
if (tr.getTimestamp().equals(ls.getWriteTimestamp())) {
// log.debug("Checked lock {} in store {}", target, store.getName());
log.debug("Checked lock {}", target);
return;
}
log.warn("Skipping outdated lock on {} with our rid ({}) but mismatched timestamp (actual ts {}, expected ts {})",
new Object[]{target, tr.getRid(), tr.getTimestamp(),
ls.getWriteTimestamp()});
}
/*
* Both exceptions below shouldn't happen under normal operation with a
* sane configuration. When they are thrown, they have one of two likely
* root causes:
*
* 1. Due to a problem with this locker's store configuration or the
* store itself, this locker's store "lost" a write. Specifically, a
* column previously added to the store by writeLock(...) was not
* returned on a subsequent read by checkLocks(...). The precise root
* cause is store-specific. With Cassandra, for instance, this problem
* could arise if the locker is configured to talk to Cassandra at a
* consistency level below QUORUM.
*
* 2. One of our previously written locks has already expired by the
* time we tried to read it.
*
* There might be additional causes that haven't occurred to me, but
* these two seem most likely.
*/
if (0 == trCount) {
throw new TemporaryLockingException("No lock columns found for " + target);
} else {
final String msg = "Read "
+ trCount
+ " locks with our rid "
+ rid
+ " but mismatched timestamps; no lock column contained our timestamp ("
+ ls.getWriteTimestamp() + ")";
throw new PermanentBackendException(msg);
}
}
@Override
protected void deleteSingleLock(KeyColumn kc, ConsistentKeyLockStatus ls, StoreTransaction tx) {
List<StaticBuffer> dels = ImmutableList.of(serializer.toLockCol(ls.getWriteTimestamp(), rid, times));
for (int i = 0; i < lockRetryCount; i++) {
try {
StoreTransaction newTx = overrideTimestamp(tx, times.getTime());
store.mutate(serializer.toLockKey(kc.getKey(), kc.getColumn()), ImmutableList.<Entry>of(), dels, newTx);
return;
} catch (TemporaryBackendException e) {
log.warn("Temporary storage exception while deleting lock", e);
// don't return -- iterate and retry
} catch (BackendException e) {
log.error("Storage exception while deleting lock", e);
return; // give up on this lock
}
}
}
private StoreTransaction overrideTimestamp(final StoreTransaction tx, final Instant commitTime) throws BackendException {
StandardBaseTransactionConfig newCfg = new StandardBaseTransactionConfig.Builder(tx.getConfiguration())
.commitTime(commitTime).build();
return manager.beginTransaction(newCfg);
}
private static class WriteResult {
private final Duration duration;
private final Instant writeTimestamp;
private final StaticBuffer lockCol;
private final Throwable throwable;
public WriteResult(Duration duration, Instant writeTimestamp, StaticBuffer lockCol, Throwable throwable) {
this.duration = duration;
this.writeTimestamp = writeTimestamp;
this.lockCol = lockCol;
this.throwable = throwable;
}
public Duration getDuration() {
return duration;
}
public Instant getWriteTimestamp() {
return writeTimestamp;
}
public boolean isSuccessful() {
return null == throwable;
}
public StaticBuffer getLockCol() {
return lockCol;
}
public Throwable getThrowable() {
return throwable;
}
}
}