/**
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.aurora.scheduler.storage.log;
import java.io.IOException;
import java.util.Date;
import java.util.Map;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.ReentrantLock;
import java.util.function.Consumer;
import javax.inject.Inject;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Optional;
import com.google.common.collect.ImmutableMap;
import com.google.common.util.concurrent.MoreExecutors;
import org.apache.aurora.codec.ThriftBinaryCodec.CodingException;
import org.apache.aurora.common.application.ShutdownRegistry;
import org.apache.aurora.common.inject.TimedInterceptor.Timed;
import org.apache.aurora.common.quantity.Amount;
import org.apache.aurora.common.quantity.Time;
import org.apache.aurora.common.stats.SlidingStats;
import org.apache.aurora.gen.HostAttributes;
import org.apache.aurora.gen.storage.LogEntry;
import org.apache.aurora.gen.storage.Op;
import org.apache.aurora.gen.storage.RewriteTask;
import org.apache.aurora.gen.storage.SaveCronJob;
import org.apache.aurora.gen.storage.SaveJobInstanceUpdateEvent;
import org.apache.aurora.gen.storage.SaveJobUpdateEvent;
import org.apache.aurora.gen.storage.SaveQuota;
import org.apache.aurora.gen.storage.Snapshot;
import org.apache.aurora.scheduler.base.AsyncUtil;
import org.apache.aurora.scheduler.base.SchedulerException;
import org.apache.aurora.scheduler.events.EventSink;
import org.apache.aurora.scheduler.log.Log.Stream.InvalidPositionException;
import org.apache.aurora.scheduler.log.Log.Stream.StreamAccessException;
import org.apache.aurora.scheduler.storage.AttributeStore;
import org.apache.aurora.scheduler.storage.CronJobStore;
import org.apache.aurora.scheduler.storage.DistributedSnapshotStore;
import org.apache.aurora.scheduler.storage.JobUpdateStore;
import org.apache.aurora.scheduler.storage.LockStore;
import org.apache.aurora.scheduler.storage.QuotaStore;
import org.apache.aurora.scheduler.storage.SchedulerStore;
import org.apache.aurora.scheduler.storage.SnapshotStore;
import org.apache.aurora.scheduler.storage.Storage;
import org.apache.aurora.scheduler.storage.Storage.MutateWork.NoResult;
import org.apache.aurora.scheduler.storage.Storage.NonVolatileStorage;
import org.apache.aurora.scheduler.storage.TaskStore;
import org.apache.aurora.scheduler.storage.entities.IHostAttributes;
import org.apache.aurora.scheduler.storage.entities.IJobInstanceUpdateEvent;
import org.apache.aurora.scheduler.storage.entities.IJobKey;
import org.apache.aurora.scheduler.storage.entities.IJobUpdateEvent;
import org.apache.aurora.scheduler.storage.entities.IJobUpdateKey;
import org.apache.aurora.scheduler.storage.entities.ILock;
import org.apache.aurora.scheduler.storage.entities.ILockKey;
import org.apache.aurora.scheduler.storage.entities.ITaskConfig;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import static java.util.Objects.requireNonNull;
/**
* A storage implementation that ensures committed transactions are written to a log.
*
* <p>In the classic write-ahead log usage we'd perform mutations as follows:
* <ol>
* <li>write op to log</li>
* <li>perform op locally</li>
* <li>*checkpoint</li>
* </ol>
*
* <p>Writing the operation to the log provides us with a fast persistence mechanism to ensure we
* have a record of our mutation in case we should need to recover state later after a crash or on
* a new host (assuming the log is distributed). We then apply the mutation to a local (in-memory)
* data structure for serving fast read requests and then optionally write down the position of the
* log entry we wrote in the first step to stable storage to allow for quicker recovery after a
* crash. Instead of reading the whole log, we can read all entries past the checkpoint. This
* design implies that all mutations must be idempotent and free from constraint and thus
* replayable over newer operations when recovering from an old checkpoint.
*
* <p>The important detail in our case is the possibility of writing an op to the log, and then
* failing to commit locally since we use a local database instead of an in-memory data structure.
* If we die after such a failure, then another instance can read and apply the logged op
* erroneously.
*
* <p>This implementation leverages a local transaction to handle this:
* <ol>
* <li>start local transaction</li>
* <li>perform op locally (uncommitted!)</li>
* <li>write op to log</li>
* <li>commit local transaction</li>
* <li>*checkpoint</li>
* </ol>
*
* <p>If the op fails to apply to local storage we will never write the op to the log and if the op
* fails to apply to the log, it'll throw and abort the local storage transaction as well.
*/
public class LogStorage implements NonVolatileStorage, DistributedSnapshotStore {
/**
* A service that can schedule an action to be executed periodically.
*/
@VisibleForTesting
interface SchedulingService {
/**
* Schedules an action to execute periodically.
*
* @param interval The time period to wait until running the {@code action} again.
* @param action The action to execute periodically.
*/
void doEvery(Amount<Long, Time> interval, Runnable action);
}
/**
* A maintainer for context about open transactions. Assumes that an external entity is
* responsible for opening and closing transactions.
*/
interface TransactionManager {
/**
* Checks whether there is an open transaction.
*
* @return {@code true} if there is an open transaction, {@code false} otherwise.
*/
boolean hasActiveTransaction();
/**
* Adds an operation to the existing transaction.
*
* @param op Operation to include in the existing transaction.
*/
void log(Op op);
}
private static class ScheduledExecutorSchedulingService implements SchedulingService {
private final ScheduledExecutorService scheduledExecutor;
ScheduledExecutorSchedulingService(ShutdownRegistry shutdownRegistry,
Amount<Long, Time> shutdownGracePeriod) {
scheduledExecutor = AsyncUtil.singleThreadLoggingScheduledExecutor("LogStorage-%d", LOG);
shutdownRegistry.addAction(() -> MoreExecutors.shutdownAndAwaitTermination(
scheduledExecutor,
shutdownGracePeriod.getValue(),
shutdownGracePeriod.getUnit().getTimeUnit()));
}
@Override
public void doEvery(Amount<Long, Time> interval, Runnable action) {
requireNonNull(interval);
requireNonNull(action);
long delay = interval.getValue();
TimeUnit timeUnit = interval.getUnit().getTimeUnit();
scheduledExecutor.scheduleWithFixedDelay(action, delay, delay, timeUnit);
}
}
private static final Logger LOG = LoggerFactory.getLogger(LogStorage.class);
private final LogManager logManager;
private final SchedulingService schedulingService;
private final SnapshotStore<Snapshot> snapshotStore;
private final Amount<Long, Time> snapshotInterval;
private final Storage writeBehindStorage;
private final SchedulerStore.Mutable writeBehindSchedulerStore;
private final CronJobStore.Mutable writeBehindJobStore;
private final TaskStore.Mutable writeBehindTaskStore;
private final LockStore.Mutable writeBehindLockStore;
private final QuotaStore.Mutable writeBehindQuotaStore;
private final AttributeStore.Mutable writeBehindAttributeStore;
private final JobUpdateStore.Mutable writeBehindJobUpdateStore;
private final ReentrantLock writeLock;
private final ThriftBackfill thriftBackfill;
private StreamManager streamManager;
private final WriteAheadStorage writeAheadStorage;
// TODO(wfarner): It should be possible to remove this flag now, since all call stacks when
// recovering are controlled at this layer (they're all calls to Mutable store implementations).
// The more involved change is changing SnapshotStore to accept a Mutable store provider to
// avoid a call to Storage.write() when we replay a Snapshot.
private boolean recovered = false;
private StreamTransaction transaction = null;
private final SlidingStats writerWaitStats =
new SlidingStats("log_storage_write_lock_wait", "ns");
private final Map<LogEntry._Fields, Consumer<LogEntry>> logEntryReplayActions;
private final Map<Op._Fields, Consumer<Op>> transactionReplayActions;
@Inject
LogStorage(
LogManager logManager,
ShutdownRegistry shutdownRegistry,
Settings settings,
SnapshotStore<Snapshot> snapshotStore,
@Volatile Storage storage,
@Volatile SchedulerStore.Mutable schedulerStore,
@Volatile CronJobStore.Mutable jobStore,
@Volatile TaskStore.Mutable taskStore,
@Volatile LockStore.Mutable lockStore,
@Volatile QuotaStore.Mutable quotaStore,
@Volatile AttributeStore.Mutable attributeStore,
@Volatile JobUpdateStore.Mutable jobUpdateStore,
EventSink eventSink,
ReentrantLock writeLock,
ThriftBackfill thriftBackfill) {
this(logManager,
new ScheduledExecutorSchedulingService(shutdownRegistry, settings.getShutdownGracePeriod()),
snapshotStore,
settings.getSnapshotInterval(),
storage,
schedulerStore,
jobStore,
taskStore,
lockStore,
quotaStore,
attributeStore,
jobUpdateStore,
eventSink,
writeLock,
thriftBackfill);
}
@VisibleForTesting
LogStorage(
LogManager logManager,
SchedulingService schedulingService,
SnapshotStore<Snapshot> snapshotStore,
Amount<Long, Time> snapshotInterval,
Storage delegateStorage,
SchedulerStore.Mutable schedulerStore,
CronJobStore.Mutable jobStore,
TaskStore.Mutable taskStore,
LockStore.Mutable lockStore,
QuotaStore.Mutable quotaStore,
AttributeStore.Mutable attributeStore,
JobUpdateStore.Mutable jobUpdateStore,
EventSink eventSink,
ReentrantLock writeLock,
ThriftBackfill thriftBackfill) {
this.logManager = requireNonNull(logManager);
this.schedulingService = requireNonNull(schedulingService);
this.snapshotStore = requireNonNull(snapshotStore);
this.snapshotInterval = requireNonNull(snapshotInterval);
// Log storage has two distinct operating modes: pre- and post-recovery. When recovering,
// we write directly to the writeBehind stores since we are replaying what's already persisted.
// After that, all writes must succeed in the distributed log before they may be considered
// successful.
this.writeBehindStorage = requireNonNull(delegateStorage);
this.writeBehindSchedulerStore = requireNonNull(schedulerStore);
this.writeBehindJobStore = requireNonNull(jobStore);
this.writeBehindTaskStore = requireNonNull(taskStore);
this.writeBehindLockStore = requireNonNull(lockStore);
this.writeBehindQuotaStore = requireNonNull(quotaStore);
this.writeBehindAttributeStore = requireNonNull(attributeStore);
this.writeBehindJobUpdateStore = requireNonNull(jobUpdateStore);
this.writeLock = requireNonNull(writeLock);
this.thriftBackfill = requireNonNull(thriftBackfill);
TransactionManager transactionManager = new TransactionManager() {
@Override
public boolean hasActiveTransaction() {
return transaction != null;
}
@Override
public void log(Op op) {
transaction.add(op);
}
};
this.writeAheadStorage = new WriteAheadStorage(
transactionManager,
schedulerStore,
jobStore,
taskStore,
lockStore,
quotaStore,
attributeStore,
jobUpdateStore,
LoggerFactory.getLogger(WriteAheadStorage.class),
eventSink);
this.logEntryReplayActions = buildLogEntryReplayActions();
this.transactionReplayActions = buildTransactionReplayActions();
}
@VisibleForTesting
final Map<LogEntry._Fields, Consumer<LogEntry>> buildLogEntryReplayActions() {
return ImmutableMap.<LogEntry._Fields, Consumer<LogEntry>>builder()
.put(LogEntry._Fields.SNAPSHOT, logEntry -> {
Snapshot snapshot = logEntry.getSnapshot();
LOG.info("Applying snapshot taken on " + new Date(snapshot.getTimestamp()));
snapshotStore.applySnapshot(snapshot);
})
.put(LogEntry._Fields.TRANSACTION, logEntry -> write((NoResult.Quiet) unused -> {
for (Op op : logEntry.getTransaction().getOps()) {
replayOp(op);
}
}))
.put(LogEntry._Fields.NOOP, item -> {
// Nothing to do here
})
.build();
}
@VisibleForTesting
final Map<Op._Fields, Consumer<Op>> buildTransactionReplayActions() {
return ImmutableMap.<Op._Fields, Consumer<Op>>builder()
.put(
Op._Fields.SAVE_FRAMEWORK_ID,
op -> writeBehindSchedulerStore.saveFrameworkId(op.getSaveFrameworkId().getId()))
.put(Op._Fields.SAVE_CRON_JOB, op -> {
SaveCronJob cronJob = op.getSaveCronJob();
writeBehindJobStore.saveAcceptedJob(
thriftBackfill.backfillJobConfiguration(cronJob.getJobConfig()));
})
.put(
Op._Fields.REMOVE_JOB,
op -> writeBehindJobStore.removeJob(IJobKey.build(op.getRemoveJob().getJobKey())))
.put(
Op._Fields.SAVE_TASKS,
op -> writeBehindTaskStore.saveTasks(
thriftBackfill.backfillTasks(op.getSaveTasks().getTasks())))
.put(Op._Fields.REWRITE_TASK, op -> {
RewriteTask rewriteTask = op.getRewriteTask();
writeBehindTaskStore.unsafeModifyInPlace(
rewriteTask.getTaskId(),
ITaskConfig.build(rewriteTask.getTask()));
})
.put(
Op._Fields.REMOVE_TASKS,
op -> writeBehindTaskStore.deleteTasks(op.getRemoveTasks().getTaskIds()))
.put(Op._Fields.SAVE_QUOTA, op -> {
SaveQuota saveQuota = op.getSaveQuota();
writeBehindQuotaStore.saveQuota(
saveQuota.getRole(),
ThriftBackfill.backfillResourceAggregate(saveQuota.getQuota()));
})
.put(
Op._Fields.REMOVE_QUOTA,
op -> writeBehindQuotaStore.removeQuota(op.getRemoveQuota().getRole()))
.put(Op._Fields.SAVE_HOST_ATTRIBUTES, op -> {
HostAttributes attributes = op.getSaveHostAttributes().getHostAttributes();
// Prior to commit 5cf760b, the store would persist maintenance mode changes for
// unknown hosts. 5cf760b began rejecting these, but the replicated log may still
// contain entries with a null slave ID.
if (attributes.isSetSlaveId()) {
writeBehindAttributeStore.saveHostAttributes(IHostAttributes.build(attributes));
} else {
LOG.info("Dropping host attributes with no agent ID: " + attributes);
}
})
.put(
Op._Fields.SAVE_LOCK,
op -> writeBehindLockStore.saveLock(ILock.build(op.getSaveLock().getLock())))
.put(
Op._Fields.REMOVE_LOCK,
op -> writeBehindLockStore.removeLock(ILockKey.build(op.getRemoveLock().getLockKey())))
.put(Op._Fields.SAVE_JOB_UPDATE, op ->
writeBehindJobUpdateStore.saveJobUpdate(
thriftBackfill.backFillJobUpdate(op.getSaveJobUpdate().getJobUpdate()),
Optional.fromNullable(op.getSaveJobUpdate().getLockToken())))
.put(Op._Fields.SAVE_JOB_UPDATE_EVENT, op -> {
SaveJobUpdateEvent event = op.getSaveJobUpdateEvent();
writeBehindJobUpdateStore.saveJobUpdateEvent(
IJobUpdateKey.build(event.getKey()),
IJobUpdateEvent.build(op.getSaveJobUpdateEvent().getEvent()));
})
.put(Op._Fields.SAVE_JOB_INSTANCE_UPDATE_EVENT, op -> {
SaveJobInstanceUpdateEvent event = op.getSaveJobInstanceUpdateEvent();
writeBehindJobUpdateStore.saveJobInstanceUpdateEvent(
IJobUpdateKey.build(event.getKey()),
IJobInstanceUpdateEvent.build(op.getSaveJobInstanceUpdateEvent().getEvent()));
})
.put(Op._Fields.PRUNE_JOB_UPDATE_HISTORY, op -> writeBehindJobUpdateStore.pruneHistory(
op.getPruneJobUpdateHistory().getPerJobRetainCount(),
op.getPruneJobUpdateHistory().getHistoryPruneThresholdMs())).build();
}
@Override
@Timed("scheduler_storage_prepare")
public synchronized void prepare() {
writeBehindStorage.prepare();
// Open the log to make a log replica available to the scheduler group.
try {
streamManager = logManager.open();
} catch (IOException e) {
throw new IllegalStateException("Failed to open the log, cannot continue", e);
}
}
@Override
@Timed("scheduler_storage_start")
public synchronized void start(final MutateWork.NoResult.Quiet initializationLogic) {
write((NoResult.Quiet) unused -> {
// Must have the underlying storage started so we can query it for the last checkpoint.
// We replay these entries in the forwarded storage system's transactions but not ours - we
// do not want to re-record these ops to the log.
recover();
recovered = true;
// Now that we're recovered we should let any mutations done in initializationLogic append
// to the log, so run it in one of our transactions.
write(initializationLogic);
});
scheduleSnapshots();
}
@Override
public void stop() {
// No-op.
}
@Timed("scheduler_log_recover")
void recover() throws RecoveryFailedException {
try {
streamManager.readFromBeginning(LogStorage.this::replay);
} catch (CodingException | InvalidPositionException | StreamAccessException e) {
throw new RecoveryFailedException(e);
}
}
private static final class RecoveryFailedException extends SchedulerException {
RecoveryFailedException(Throwable cause) {
super(cause);
}
}
private void replay(final LogEntry logEntry) {
LogEntry._Fields entryField = logEntry.getSetField();
if (!logEntryReplayActions.containsKey(entryField)) {
throw new IllegalStateException("Unknown log entry type: " + entryField);
}
logEntryReplayActions.get(entryField).accept(logEntry);
}
private void replayOp(Op op) {
Op._Fields opField = op.getSetField();
if (!transactionReplayActions.containsKey(opField)) {
throw new IllegalStateException("Unknown transaction op: " + opField);
}
transactionReplayActions.get(opField).accept(op);
}
private void scheduleSnapshots() {
if (snapshotInterval.getValue() > 0) {
schedulingService.doEvery(snapshotInterval, () -> {
try {
snapshot();
} catch (StorageException e) {
if (e.getCause() == null) {
LOG.warn("StorageException when attempting to snapshot.", e);
} else {
LOG.warn(e.getMessage(), e.getCause());
}
}
});
}
}
/**
* Forces a snapshot of the storage state.
*
* @throws CodingException If there is a problem encoding the snapshot.
* @throws InvalidPositionException If the log stream cursor is invalid.
* @throws StreamAccessException If there is a problem writing the snapshot to the log stream.
*/
@Timed("scheduler_log_snapshot")
void doSnapshot() throws CodingException, InvalidPositionException, StreamAccessException {
write((NoResult<CodingException>) (MutableStoreProvider unused) -> {
LOG.info("Creating snapshot.");
Snapshot snapshot = snapshotStore.createSnapshot();
persist(snapshot);
LOG.info("Snapshot complete."
+ " host attrs: " + snapshot.getHostAttributesSize()
+ ", cron jobs: " + snapshot.getCronJobsSize()
+ ", locks: " + snapshot.getLocksSize()
+ ", quota confs: " + snapshot.getQuotaConfigurationsSize()
+ ", tasks: " + snapshot.getTasksSize()
+ ", updates: " + snapshot.getJobUpdateDetailsSize());
});
}
@Timed("scheduler_log_snapshot_persist")
@Override
public void persist(Snapshot snapshot)
throws CodingException, InvalidPositionException, StreamAccessException {
streamManager.snapshot(snapshot);
}
private <T, E extends Exception> T doInTransaction(final MutateWork<T, E> work)
throws StorageException, E {
// The log stream transaction has already been set up so we just need to delegate with our
// store provider so any mutations performed by work get logged.
if (transaction != null) {
return work.apply(writeAheadStorage);
}
transaction = streamManager.startTransaction();
try {
return writeBehindStorage.write(unused -> {
T result = work.apply(writeAheadStorage);
try {
transaction.commit();
} catch (CodingException e) {
throw new IllegalStateException(
"Problem encoding transaction operations to the log stream", e);
} catch (StreamAccessException e) {
throw new StorageException(
"There was a problem committing the transaction to the log.", e);
}
return result;
});
} finally {
transaction = null;
}
}
@Override
public <T, E extends Exception> T write(final MutateWork<T, E> work) throws StorageException, E {
long waitStart = System.nanoTime();
writeLock.lock();
try {
writerWaitStats.accumulate(System.nanoTime() - waitStart);
// We don't want to use the log when recovering from it, we just want to update the underlying
// store - so pass mutations straight through to the underlying storage.
if (!recovered) {
return writeBehindStorage.write(work);
}
return doInTransaction(work);
} finally {
writeLock.unlock();
}
}
@Override
public <T, E extends Exception> T read(Work<T, E> work) throws StorageException, E {
return writeBehindStorage.read(work);
}
@Override
public void snapshot() throws StorageException {
try {
doSnapshot();
} catch (CodingException e) {
throw new StorageException("Failed to encode a snapshot", e);
} catch (InvalidPositionException e) {
throw new StorageException("Saved snapshot but failed to truncate entries preceding it", e);
} catch (StreamAccessException e) {
throw new StorageException("Failed to create a snapshot", e);
}
}
/**
* Configuration settings for log storage.
*/
public static class Settings {
private final Amount<Long, Time> shutdownGracePeriod;
private final Amount<Long, Time> snapshotInterval;
public Settings(Amount<Long, Time> shutdownGracePeriod, Amount<Long, Time> snapshotInterval) {
this.shutdownGracePeriod = requireNonNull(shutdownGracePeriod);
this.snapshotInterval = requireNonNull(snapshotInterval);
}
public Amount<Long, Time> getShutdownGracePeriod() {
return shutdownGracePeriod;
}
public Amount<Long, Time> getSnapshotInterval() {
return snapshotInterval;
}
}
}