/**
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.aurora.scheduler.updater;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ScheduledExecutorService;
import com.google.common.base.Function;
import com.google.common.base.Optional;
import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableMultimap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import com.google.common.collect.Multimaps;
import com.google.common.collect.Ordering;
import com.google.common.eventbus.EventBus;
import com.google.common.primitives.Ints;
import com.google.inject.AbstractModule;
import com.google.inject.Guice;
import com.google.inject.Injector;
import org.apache.aurora.common.application.Lifecycle;
import org.apache.aurora.common.base.Command;
import org.apache.aurora.common.quantity.Amount;
import org.apache.aurora.common.quantity.Time;
import org.apache.aurora.common.stats.Stats;
import org.apache.aurora.common.stats.StatsProvider;
import org.apache.aurora.common.testing.easymock.EasyMockTest;
import org.apache.aurora.common.util.Clock;
import org.apache.aurora.common.util.TruncatedBinaryBackoff;
import org.apache.aurora.gen.InstanceTaskConfig;
import org.apache.aurora.gen.JobUpdate;
import org.apache.aurora.gen.JobUpdateAction;
import org.apache.aurora.gen.JobUpdateEvent;
import org.apache.aurora.gen.JobUpdateInstructions;
import org.apache.aurora.gen.JobUpdateKey;
import org.apache.aurora.gen.JobUpdatePulseStatus;
import org.apache.aurora.gen.JobUpdateSettings;
import org.apache.aurora.gen.JobUpdateState;
import org.apache.aurora.gen.JobUpdateStatus;
import org.apache.aurora.gen.JobUpdateSummary;
import org.apache.aurora.gen.LockKey;
import org.apache.aurora.gen.Metadata;
import org.apache.aurora.gen.Range;
import org.apache.aurora.gen.ScheduleStatus;
import org.apache.aurora.gen.ScheduledTask;
import org.apache.aurora.gen.TaskConfig;
import org.apache.aurora.scheduler.SchedulerModule.TaskEventBatchWorker;
import org.apache.aurora.scheduler.TaskIdGenerator;
import org.apache.aurora.scheduler.TaskIdGenerator.TaskIdGeneratorImpl;
import org.apache.aurora.scheduler.base.JobKeys;
import org.apache.aurora.scheduler.base.Query;
import org.apache.aurora.scheduler.base.TaskTestUtil;
import org.apache.aurora.scheduler.base.Tasks;
import org.apache.aurora.scheduler.events.EventSink;
import org.apache.aurora.scheduler.events.PubsubEvent;
import org.apache.aurora.scheduler.mesos.Driver;
import org.apache.aurora.scheduler.scheduling.RescheduleCalculator;
import org.apache.aurora.scheduler.scheduling.RescheduleCalculator.RescheduleCalculatorImpl;
import org.apache.aurora.scheduler.state.LockManager;
import org.apache.aurora.scheduler.state.LockManagerImpl;
import org.apache.aurora.scheduler.state.StateChangeResult;
import org.apache.aurora.scheduler.state.StateManager;
import org.apache.aurora.scheduler.state.StateManagerImpl;
import org.apache.aurora.scheduler.state.UUIDGenerator;
import org.apache.aurora.scheduler.state.UUIDGenerator.UUIDGeneratorImpl;
import org.apache.aurora.scheduler.storage.JobUpdateStore;
import org.apache.aurora.scheduler.storage.Storage;
import org.apache.aurora.scheduler.storage.db.DbModule;
import org.apache.aurora.scheduler.storage.entities.IInstanceTaskConfig;
import org.apache.aurora.scheduler.storage.entities.IJobInstanceUpdateEvent;
import org.apache.aurora.scheduler.storage.entities.IJobKey;
import org.apache.aurora.scheduler.storage.entities.IJobUpdate;
import org.apache.aurora.scheduler.storage.entities.IJobUpdateDetails;
import org.apache.aurora.scheduler.storage.entities.IJobUpdateEvent;
import org.apache.aurora.scheduler.storage.entities.IJobUpdateKey;
import org.apache.aurora.scheduler.storage.entities.IJobUpdateSummary;
import org.apache.aurora.scheduler.storage.entities.ILock;
import org.apache.aurora.scheduler.storage.entities.ILockKey;
import org.apache.aurora.scheduler.storage.entities.IScheduledTask;
import org.apache.aurora.scheduler.storage.entities.ITaskConfig;
import org.apache.aurora.scheduler.testing.FakeScheduledExecutor;
import org.apache.aurora.scheduler.testing.FakeStatsProvider;
import org.apache.aurora.scheduler.updater.JobUpdateController.AuditData;
import org.apache.aurora.scheduler.updater.StateEvaluator.Failure;
import org.easymock.EasyMock;
import org.easymock.IExpectationSetters;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import static org.apache.aurora.gen.JobUpdateAction.INSTANCE_ROLLBACK_FAILED;
import static org.apache.aurora.gen.JobUpdateAction.INSTANCE_ROLLED_BACK;
import static org.apache.aurora.gen.JobUpdateAction.INSTANCE_ROLLING_BACK;
import static org.apache.aurora.gen.JobUpdateAction.INSTANCE_UPDATED;
import static org.apache.aurora.gen.JobUpdateAction.INSTANCE_UPDATE_FAILED;
import static org.apache.aurora.gen.JobUpdateAction.INSTANCE_UPDATING;
import static org.apache.aurora.gen.JobUpdateStatus.ABORTED;
import static org.apache.aurora.gen.JobUpdateStatus.ERROR;
import static org.apache.aurora.gen.JobUpdateStatus.ROLLED_BACK;
import static org.apache.aurora.gen.JobUpdateStatus.ROLLED_FORWARD;
import static org.apache.aurora.gen.JobUpdateStatus.ROLLING_BACK;
import static org.apache.aurora.gen.JobUpdateStatus.ROLLING_FORWARD;
import static org.apache.aurora.gen.JobUpdateStatus.ROLL_BACK_PAUSED;
import static org.apache.aurora.gen.JobUpdateStatus.ROLL_FORWARD_AWAITING_PULSE;
import static org.apache.aurora.gen.JobUpdateStatus.ROLL_FORWARD_PAUSED;
import static org.apache.aurora.gen.ScheduleStatus.ASSIGNED;
import static org.apache.aurora.gen.ScheduleStatus.FAILED;
import static org.apache.aurora.gen.ScheduleStatus.FINISHED;
import static org.apache.aurora.gen.ScheduleStatus.KILLED;
import static org.apache.aurora.gen.ScheduleStatus.RUNNING;
import static org.apache.aurora.gen.ScheduleStatus.STARTING;
import static org.apache.aurora.scheduler.storage.Storage.MutateWork.NoResult;
import static org.apache.aurora.scheduler.testing.BatchWorkerUtil.expectBatchExecute;
import static org.apache.aurora.scheduler.updater.UpdateFactory.UpdateFactoryImpl.expandInstanceIds;
import static org.easymock.EasyMock.expectLastCall;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.fail;
public class JobUpdaterIT extends EasyMockTest {
// Name used as the lock holder in testJobLocked and as the user recorded in AUDIT.
private static final String USER = "user";
// Audit data passed to the updater's start/pause/resume/abort calls.
private static final AuditData AUDIT = new AuditData(USER, Optional.of("message"));
// Job key the tests operate on.
private static final IJobKey JOB = JobKeys.from("role", "env", "job1");
// Key of the update under test: JOB combined with the fixed id "update_id".
private static final IJobUpdateKey UPDATE_ID =
IJobUpdateKey.build(new JobUpdateKey(JOB.newBuilder(), "update_id"));
// Amount by which tests advance the fake clock after moving an instance to RUNNING.
// NOTE(review): presumably matches the watch interval set up by makeJobUpdate - confirm.
private static final Amount<Long, Time> WATCH_TIMEOUT = Amount.of(2000L, Time.MILLISECONDS);
// Flapping threshold handed to the RescheduleCalculatorSettings binding in setUp.
private static final Amount<Long, Time> FLAPPING_THRESHOLD = Amount.of(1L, Time.MILLISECONDS);
// Convenience durations used to advance the fake clock.
private static final Amount<Long, Time> ONE_DAY = Amount.of(1L, Time.DAYS);
private static final Amount<Long, Time> ONE_HOUR = Amount.of(1L, Time.HOURS);
private static final Amount<Long, Time> ONE_MINUTE = Amount.of(1L, Time.MINUTES);
// Config produced by TaskTestUtil.makeConfig(JOB) with executor data "olddata".
private static final ITaskConfig OLD_CONFIG =
setExecutorData(TaskTestUtil.makeConfig(JOB), "olddata");
// Derived from OLD_CONFIG with the executor data replaced by "newdata".
private static final ITaskConfig NEW_CONFIG = setExecutorData(OLD_CONFIG, "newdata");
// Value passed to setBlockIfNoPulsesAfterMs in the pulse-coordinated tests.
private static final long PULSE_TIMEOUT_MS = 10000;
// Fixed key/value metadata entries.
private static final ImmutableSet<Metadata> METADATA = ImmutableSet.of(
new Metadata("k1", "v1"), new Metadata("k2", "v2"));
// Fake executor that is also bound as the Clock; tests advance it manually.
private FakeScheduledExecutor clock;
// Controller under test, obtained from the injector in setUp.
private JobUpdateController updater;
// Mock driver; kill expectations are recorded through expectTaskKilled().
private Driver driver;
// Event bus whose post method is bound as the EventSink.
private EventBus eventBus;
private Storage storage;
private LockManager lockManager;
private StateManager stateManager;
// Subscriber that the storage-recovery tests start explicitly.
private JobUpdateEventSubscriber subscriber;
// Mock command handed to the Lifecycle binding.
private Command shutdownCommand;
/**
 * Builds a task config from {@code task}'s builder with the executor data replaced.
 *
 * @param task config to derive the new config from.
 * @param executorData value to store as the executor config data.
 * @return the config built from the modified builder.
 */
private static ITaskConfig setExecutorData(ITaskConfig task, String executorData) {
  TaskConfig mutableConfig = task.newBuilder();
  mutableConfig.getExecutorConfig().setData(executorData);
  return ITaskConfig.build(mutableConfig);
}
/**
 * Builds the object graph used by every test.
 *
 * <p>A mock {@link ScheduledExecutorService} is passed to
 * {@link FakeScheduledExecutor#scheduleExecutor}; the resulting fake is bound as the
 * {@link Clock} and advanced manually by tests. The driver, shutdown command and batch worker
 * are EasyMock mocks; the remaining collaborators come from the Guice injector.
 */
@Before
public void setUp() throws Exception {
// Avoid console spam due to stats registered multiple times.
Stats.flush();
ScheduledExecutorService executor = createMock(ScheduledExecutorService.class);
clock = FakeScheduledExecutor.scheduleExecutor(executor);
driver = createMock(Driver.class);
shutdownCommand = createMock(Command.class);
eventBus = new EventBus();
TaskEventBatchWorker batchWorker = createMock(TaskEventBatchWorker.class);
Injector injector = Guice.createInjector(
new UpdaterModule(executor, true),
DbModule.testModuleWithWorkQueue(),
new AbstractModule() {
@Override
protected void configure() {
bind(StatsProvider.class).toInstance(new FakeStatsProvider());
bind(Clock.class).toInstance(clock);
bind(StateManager.class).to(StateManagerImpl.class);
bind(Driver.class).toInstance(driver);
bind(TaskIdGenerator.class).to(TaskIdGeneratorImpl.class);
bind(RescheduleCalculator.class).to(RescheduleCalculatorImpl.class);
bind(RescheduleCalculatorImpl.RescheduleCalculatorSettings.class)
.toInstance(new RescheduleCalculatorImpl.RescheduleCalculatorSettings(
new TruncatedBinaryBackoff(
Amount.of(1L, Time.SECONDS), Amount.of(1L, Time.MINUTES)),
FLAPPING_THRESHOLD,
Amount.of(1, Time.MINUTES)));
// Events posted to the sink are delivered through the test's EventBus.
bind(EventSink.class).toInstance(eventBus::post);
bind(LockManager.class).to(LockManagerImpl.class);
bind(UUIDGenerator.class).to(UUIDGeneratorImpl.class);
// The mock shutdown command lets tests observe and script shutdown requests.
bind(Lifecycle.class).toInstance(new Lifecycle(shutdownCommand));
bind(TaskEventBatchWorker.class).toInstance(batchWorker);
}
});
updater = injector.getInstance(JobUpdateController.class);
storage = injector.getInstance(Storage.class);
storage.prepare();
lockManager = injector.getInstance(LockManager.class);
stateManager = injector.getInstance(StateManager.class);
// NOTE(review): assumes JobUpdateEventSubscriber is singleton-scoped, so the instance
// registered on the bus is the same one stored in the field - confirm.
eventBus.register(injector.getInstance(JobUpdateEventSubscriber.class));
subscriber = injector.getInstance(JobUpdateEventSubscriber.class);
// Batch-worker expectation recorded via the shared test helper, allowed any number of times.
expectBatchExecute(batchWorker, storage, control).anyTimes();
}
/**
 * Verifies after each test that the fake clock has nothing left scheduled and that the lock
 * manager reports no locks.
 */
@After
public void validateExitState() {
  clock.assertEmpty();

  ImmutableList<?> remainingLocks = ImmutableList.copyOf(lockManager.getLocks());
  assertEquals(ImmutableList.of(), remainingLocks);
}
/**
 * Starting an update while another holder owns the job lock must raise
 * {@link UpdateStateException}. The lock is released afterwards so the exit-state check passes.
 */
@Test(expected = UpdateStateException.class)
public void testJobLocked() throws Exception {
  control.replay();

  ILockKey jobLockKey = ILockKey.build(LockKey.job(JOB.newBuilder()));
  ILock heldLock = lockManager.acquireLock(jobLockKey, USER);
  try {
    IJobUpdate blockedUpdate = makeJobUpdate(makeInstanceConfig(0, 0, NEW_CONFIG));
    updater.start(blockedUpdate, AUDIT);
  } finally {
    lockManager.releaseLock(heldLock);
  }
}
/**
 * Looks up the id of the single active task for the given job instance.
 *
 * @param job job the instance belongs to.
 * @param instanceId instance whose active task is wanted.
 * @return the task id; the lookup fails unless exactly one active task matches.
 */
private String getTaskId(IJobKey job, int instanceId) {
  Iterable<IScheduledTask> activeMatches =
      Storage.Util.fetchTasks(storage, Query.instanceScoped(job, instanceId).active());
  IScheduledTask onlyTask = Iterables.getOnlyElement(activeMatches);
  return Tasks.id(onlyTask);
}
/**
 * Walks the active task of an instance through a sequence of states, one storage write per
 * transition, asserting that every transition reports {@link StateChangeResult#SUCCESS}.
 *
 * <p>The task id is looked up again for each transition.
 *
 * @param job job the instance belongs to.
 * @param instanceId instance whose task is transitioned.
 * @param status first state to apply.
 * @param statuses further states to apply, in order.
 */
private void changeState(
    IJobKey job,
    int instanceId,
    ScheduleStatus status,
    ScheduleStatus... statuses) {

  ImmutableList<ScheduleStatus> transitions =
      ImmutableList.<ScheduleStatus>builder().add(status).add(statuses).build();
  for (ScheduleStatus next : transitions) {
    storage.write((NoResult.Quiet) storeProvider -> {
      assertEquals(
          StateChangeResult.SUCCESS,
          stateManager.changeState(
              storeProvider,
              getTaskId(job, instanceId),
              Optional.absent(),
              next,
              Optional.absent()));
    });
  }
}
/** Orders instance update events by ascending timestamp. */
private static final Ordering<IJobInstanceUpdateEvent> EVENT_ORDER =
    Ordering.<Long>natural().onResultOf(IJobInstanceUpdateEvent::getTimestampMs);

/** Maps an instance update event to the id of the instance it belongs to. */
private static final Function<IJobInstanceUpdateEvent, Integer> EVENT_TO_INSTANCE =
    event -> event.getInstanceId();
/**
 * Fetches the stored details of the default update, {@code UPDATE_ID}.
 *
 * @return the update details; fails if the store has none for that key.
 */
private IJobUpdateDetails getDetails() {
  return getDetails(UPDATE_ID);
}
/**
 * Fetches the stored details of the update identified by {@code key}.
 *
 * @param key update to look up.
 * @return the update details; fails if the store has none for that key.
 */
private IJobUpdateDetails getDetails(IJobUpdateKey key) {
  return storage.read(storeProvider -> {
    JobUpdateStore updateStore = storeProvider.getJobUpdateStore();
    return updateStore.fetchJobUpdateDetails(key).get();
  });
}
/**
 * Asserts that the last update event stored for {@code UPDATE_ID} carries the given message.
 *
 * @param expected message expected on the most recent update event.
 */
private void assertLatestUpdateMessage(String expected) {
  assertEquals(expected, Iterables.getLast(getDetails().getUpdateEvents()).getMessage());
}
/**
 * Asserts the status and per-instance actions of the default update, {@code UPDATE_ID}.
 *
 * @param expected status the update must currently have.
 * @param expectedActions actions expected per instance id, in timestamp order.
 */
private void assertState(
    JobUpdateStatus expected, Multimap<Integer, JobUpdateAction> expectedActions) {

  assertStateUpdate(UPDATE_ID, expected, expectedActions);
}
/**
 * Asserts the recorded instance actions and the current status of an update.
 *
 * <p>Instance events are sorted by timestamp, grouped by instance id and mapped to their
 * actions before being compared. The actions are checked before the status.
 *
 * @param key update to inspect.
 * @param expected status the update must currently have.
 * @param expectedActions actions expected per instance id, in timestamp order.
 */
private void assertStateUpdate(
    IJobUpdateKey key,
    JobUpdateStatus expected,
    Multimap<Integer, JobUpdateAction> expectedActions) {

  IJobUpdateDetails stored = getDetails(key);

  Multimap<Integer, IJobInstanceUpdateEvent> byInstance =
      Multimaps.index(EVENT_ORDER.sortedCopy(stored.getInstanceEvents()), EVENT_TO_INSTANCE);
  Multimap<Integer, JobUpdateAction> actualActions =
      Multimaps.transformValues(byInstance, JobUpdateControllerImpl.EVENT_TO_ACTION);
  assertEquals(expectedActions, actualActions);

  assertEquals(expected, stored.getUpdate().getSummary().getState().getStatus());
}
/**
 * Records an expectation that the mock driver is asked to kill a task, with any argument.
 *
 * @return the expectation setter, so callers can adjust it (for example the call count).
 */
private IExpectationSetters<String> expectTaskKilled() {
  driver.killTask(EasyMock.anyObject());
  IExpectationSetters<String> killExpectation = expectLastCall();
  return killExpectation;
}
/**
 * Inserts pending tasks for the given instances through the state manager, in one storage write.
 *
 * @param task config the new tasks are created with.
 * @param instanceIds instances to create tasks for.
 */
private void insertPendingTasks(ITaskConfig task, Set<Integer> instanceIds) {
  storage.write((NoResult.Quiet) storeProvider -> {
    stateManager.insertPendingTasks(storeProvider, task, instanceIds);
  });
}
/**
 * Saves {@code update} to the job update store with status {@code ROLLING_FORWARD}.
 *
 * @param update update to persist.
 * @return the lock returned by {@code saveJobUpdate}.
 */
private ILock insertInProgressUpdate(IJobUpdate update) {
  return storage.write(storeProvider -> {
    return saveJobUpdate(storeProvider.getJobUpdateStore(), update, ROLLING_FORWARD);
  });
}
/**
 * Inserts pending tasks for every entry of the update's initial state, using each entry's task
 * config and its expanded instance ids. All inserts run inside one enclosing storage write.
 *
 * @param update update whose initial state describes the tasks to create.
 */
private void insertInitialTasks(IJobUpdate update) {
  storage.write((NoResult.Quiet) storeProvider ->
      update.getInstructions().getInitialState().forEach(initial ->
          insertPendingTasks(initial.getTask(), expandInstanceIds(ImmutableSet.of(initial)))));
}
/**
 * Asserts that the job's active tasks, keyed by instance id, have exactly the expected configs.
 *
 * @param job job whose active tasks are inspected.
 * @param expected expected task config per instance id.
 */
private void assertJobState(IJobKey job, Map<Integer, ITaskConfig> expected) {
  Iterable<IScheduledTask> activeTasks =
      Storage.Util.fetchTasks(storage, Query.jobScoped(job).active());
  // uniqueIndex rejects two active tasks sharing an instance id.
  Map<Integer, IScheduledTask> activeByInstance =
      Maps.uniqueIndex(activeTasks, Tasks::getInstanceId);

  assertEquals(
      expected,
      ImmutableMap.copyOf(Maps.transformValues(activeByInstance, Tasks::getConfig)));
}
/**
 * Roll-forward where instance 0 already runs NEW_CONFIG, instance 1 is added and instance 2 is
 * moved off OLD_CONFIG. The update is paused and resumed midway, and an abort is attempted after
 * it has completed.
 */
@Test
public void testSuccessfulUpdate() throws Exception {
// A single driver kill is expected over the whole test.
expectTaskKilled();
control.replay();
IJobUpdate update = makeJobUpdate(
// No-op - task is already matching the new config.
makeInstanceConfig(0, 0, NEW_CONFIG),
// Task needing update.
makeInstanceConfig(2, 2, OLD_CONFIG));
insertInitialTasks(update);
changeState(JOB, 0, ASSIGNED, STARTING, RUNNING);
changeState(JOB, 2, ASSIGNED, STARTING, RUNNING);
clock.advance(WATCH_TIMEOUT);
// Accumulates the expected per-instance actions as the test progresses.
ImmutableMultimap.Builder<Integer, JobUpdateAction> actions = ImmutableMultimap.builder();
// Instance 1 is added
updater.start(update, AUDIT);
actions.putAll(1, INSTANCE_UPDATING);
assertState(ROLLING_FORWARD, actions.build());
changeState(JOB, 1, ASSIGNED, STARTING, RUNNING);
// Updates may be paused for arbitrarily-long amounts of time, and the updater should not
// take action while paused.
updater.pause(UPDATE_ID, AUDIT);
updater.pause(UPDATE_ID, AUDIT); // Pausing again is a no-op.
assertState(ROLL_FORWARD_PAUSED, actions.build());
clock.advance(ONE_DAY);
// Task churn while paused: no new actions are expected until the update is resumed.
changeState(JOB, 1, FAILED, ASSIGNED, STARTING, RUNNING);
changeState(JOB, 2, FAILED, ASSIGNED, STARTING, RUNNING);
clock.advance(WATCH_TIMEOUT);
updater.resume(UPDATE_ID, AUDIT);
actions.putAll(1, INSTANCE_UPDATED).put(2, INSTANCE_UPDATING);
assertState(ROLLING_FORWARD, actions.build());
// A task outside the scope of the update should be ignored by the updater.
insertPendingTasks(NEW_CONFIG, ImmutableSet.of(100));
// Instance 2 is updated
changeState(JOB, 2, KILLED, ASSIGNED, STARTING, RUNNING);
clock.advance(WATCH_TIMEOUT);
actions.put(2, INSTANCE_UPDATED);
assertState(ROLLED_FORWARD, actions.build());
assertJobState(
JOB,
ImmutableMap.of(0, NEW_CONFIG, 1, NEW_CONFIG, 2, NEW_CONFIG, 100, NEW_CONFIG));
// Attempting to abort a finished update should fail.
try {
updater.abort(UPDATE_ID, AUDIT);
fail("It should not be possible to abort a completed update.");
} catch (UpdateStateException e) {
// Expected.
}
}
/**
 * Pulse-coordinated roll-forward: the update starts in ROLL_FORWARD_AWAITING_PULSE, proceeds
 * after a pulse, blocks again once the pulse timeout elapses, and finishes after a second pulse.
 */
@Test
public void testSuccessfulCoordinatedUpdate() throws Exception {
// Two driver kills are expected, matching the two instances that start on OLD_CONFIG.
expectTaskKilled().times(2);
control.replay();
JobUpdate builder = makeJobUpdate(
// No-op - task is already matching the new config.
makeInstanceConfig(0, 0, NEW_CONFIG),
// Tasks needing update.
makeInstanceConfig(1, 2, OLD_CONFIG)).newBuilder();
// Enables pulse coordination with a 10 second pulse timeout.
builder.getInstructions().getSettings().setBlockIfNoPulsesAfterMs((int) PULSE_TIMEOUT_MS);
insertInitialTasks(IJobUpdate.build(builder));
changeState(JOB, 0, ASSIGNED, STARTING, RUNNING);
changeState(JOB, 1, ASSIGNED, STARTING, RUNNING);
changeState(JOB, 2, ASSIGNED, STARTING, RUNNING);
clock.advance(WATCH_TIMEOUT);
ImmutableMultimap.Builder<Integer, JobUpdateAction> actions = ImmutableMultimap.builder();
updater.start(IJobUpdate.build(builder), AUDIT);
// The update is blocked initially waiting for a pulse.
assertState(ROLL_FORWARD_AWAITING_PULSE, actions.build());
// Pulse arrives and update starts.
assertEquals(JobUpdatePulseStatus.OK, updater.pulse(UPDATE_ID));
changeState(JOB, 1, KILLED, ASSIGNED, STARTING, RUNNING);
actions.put(1, INSTANCE_UPDATING);
assertState(ROLLING_FORWARD, actions.build());
clock.advance(WATCH_TIMEOUT);
actions.put(1, INSTANCE_UPDATED);
// The update is blocked due to expired pulse timeout.
clock.advance(Amount.of(PULSE_TIMEOUT_MS, Time.MILLISECONDS));
actions.put(2, INSTANCE_UPDATING);
changeState(JOB, 2, KILLED);
assertState(ROLL_FORWARD_AWAITING_PULSE, actions.build());
assertLatestUpdateMessage(JobUpdateControllerImpl.PULSE_TIMEOUT_MESSAGE);
// Pulse arrives and instance 2 is updated.
assertEquals(JobUpdatePulseStatus.OK, updater.pulse(UPDATE_ID));
changeState(JOB, 2, ASSIGNED, STARTING, RUNNING);
clock.advance(WATCH_TIMEOUT);
actions.put(2, INSTANCE_UPDATED);
assertState(ROLLED_FORWARD, actions.build());
assertJobState(JOB, ImmutableMap.of(0, NEW_CONFIG, 1, NEW_CONFIG, 2, NEW_CONFIG));
// Pulsing a completed update reports FINISHED.
assertEquals(JobUpdatePulseStatus.FINISHED, updater.pulse(UPDATE_ID));
}
/**
 * Saves a pulse-coordinated update directly to storage as ROLLING_FORWARD, starts the
 * subscriber, and expects the update to come up in ROLL_FORWARD_AWAITING_PULSE and then finish
 * after a pulse.
 */
@Test
public void testRecoverCoordinatedUpdateFromStorage() throws Exception {
expectTaskKilled().times(2);
control.replay();
JobUpdate builder =
setInstanceCount(makeJobUpdate(makeInstanceConfig(0, 1, OLD_CONFIG)), 2).newBuilder();
builder.getInstructions().getSettings().setBlockIfNoPulsesAfterMs((int) PULSE_TIMEOUT_MS);
IJobUpdate update = IJobUpdate.build(builder);
insertInitialTasks(update);
changeState(JOB, 0, ASSIGNED, STARTING, RUNNING);
changeState(JOB, 1, ASSIGNED, STARTING, RUNNING);
clock.advance(ONE_DAY);
// Persist the update without going through updater.start().
storage.write(
storeProvider -> saveJobUpdate(storeProvider.getJobUpdateStore(), update, ROLLING_FORWARD));
// One minute is longer than the 10 second pulse timeout configured above.
clock.advance(ONE_MINUTE);
subscriber.startAsync().awaitRunning();
ImmutableMultimap.Builder<Integer, JobUpdateAction> actions = ImmutableMultimap.builder();
// The update is blocked initially waiting for a pulse.
assertState(ROLL_FORWARD_AWAITING_PULSE, actions.build());
assertEquals(JobUpdatePulseStatus.OK, updater.pulse(UPDATE_ID));
// Instance 0 is updated.
changeState(JOB, 0, KILLED, ASSIGNED, STARTING, RUNNING);
clock.advance(WATCH_TIMEOUT);
// Instance 1 is updated.
changeState(JOB, 1, KILLED, ASSIGNED, STARTING, RUNNING);
clock.advance(WATCH_TIMEOUT);
actions.putAll(0, INSTANCE_UPDATING, INSTANCE_UPDATED)
.putAll(1, INSTANCE_UPDATING, INSTANCE_UPDATED);
assertState(ROLLED_FORWARD, actions.build());
assertEquals(JobUpdatePulseStatus.FINISHED, updater.pulse(UPDATE_ID));
}
/**
 * Recovery of a pulse-coordinated update whose pulse timeout (one hour) is much longer than the
 * time elapsed since the stored ROLLING_FORWARD event (one minute): the update is expected to
 * resume in ROLLING_FORWARD without a new pulse.
 */
@Test
public void testRecoverLongPulseTimeoutCoordinatedUpdateFromStorage() throws Exception {
// A brief failover in the middle of a rolling forward update with a long pulse timeout should
// mean that after scheduler startup the update is not waiting for a pulse.
expectTaskKilled().times(1);
control.replay();
JobUpdate builder =
setInstanceCount(makeJobUpdate(makeInstanceConfig(0, 0, OLD_CONFIG)), 1).newBuilder();
// checkedCast guards the narrowing of the one-hour millisecond value to int.
builder.getInstructions().getSettings()
.setBlockIfNoPulsesAfterMs(Ints.checkedCast(ONE_HOUR.as(Time.MILLISECONDS)));
IJobUpdate update = IJobUpdate.build(builder);
insertInitialTasks(update);
changeState(JOB, 0, ASSIGNED, STARTING, RUNNING);
clock.advance(ONE_DAY);
// Stored state: the update was first saved as awaiting a pulse.
storage.write(storeProvider ->
saveJobUpdate(storeProvider.getJobUpdateStore(), update, ROLL_FORWARD_AWAITING_PULSE));
// The first pulse comes after one minute
clock.advance(ONE_MINUTE);
// The pulse is modelled by recording a ROLLING_FORWARD event for the update.
storage.write(
(NoResult.Quiet) storeProvider ->
saveJobUpdateEvent(storeProvider.getJobUpdateStore(), update, ROLLING_FORWARD));
clock.advance(ONE_MINUTE);
subscriber.startAsync().awaitRunning();
ImmutableMultimap.Builder<Integer, JobUpdateAction> actions = ImmutableMultimap.builder();
actions.putAll(0, INSTANCE_UPDATING);
// Since the pulse interval is so large and the downtime was so short, the update does not need
// to wait for a pulse.
assertState(ROLLING_FORWARD, actions.build());
// Instance 0 is updated.
changeState(JOB, 0, KILLED, ASSIGNED, STARTING, RUNNING);
clock.advance(WATCH_TIMEOUT);
actions.putAll(0, INSTANCE_UPDATED);
assertState(ROLLED_FORWARD, actions.build());
assertEquals(JobUpdatePulseStatus.FINISHED, updater.pulse(UPDATE_ID));
}
/**
 * Recovery of an update stored as ROLL_FORWARD_AWAITING_PULSE: after the subscriber starts it
 * stays in that state until a pulse arrives, then rolls forward to completion.
 */
@Test
public void testRecoverAwaitingPulseFromStorage() throws Exception {
expectTaskKilled();
control.replay();
JobUpdate builder =
setInstanceCount(makeJobUpdate(makeInstanceConfig(0, 0, OLD_CONFIG)), 1).newBuilder();
builder.getInstructions().getSettings().setBlockIfNoPulsesAfterMs((int) PULSE_TIMEOUT_MS);
IJobUpdate update = IJobUpdate.build(builder);
insertInitialTasks(update);
changeState(JOB, 0, ASSIGNED, STARTING, RUNNING);
clock.advance(ONE_DAY);
// Persist the update directly in the awaiting-pulse state.
storage.write(storeProvider ->
saveJobUpdate(storeProvider.getJobUpdateStore(), update, ROLL_FORWARD_AWAITING_PULSE));
subscriber.startAsync().awaitRunning();
ImmutableMultimap.Builder<Integer, JobUpdateAction> actions = ImmutableMultimap.builder();
// No instance actions are expected before the first pulse.
assertState(ROLL_FORWARD_AWAITING_PULSE, actions.build());
assertEquals(JobUpdatePulseStatus.OK, updater.pulse(UPDATE_ID));
changeState(JOB, 0, KILLED, ASSIGNED, STARTING, RUNNING);
clock.advance(WATCH_TIMEOUT);
actions.putAll(0, INSTANCE_UPDATING, INSTANCE_UPDATED);
assertState(ROLLED_FORWARD, actions.build());
assertEquals(JobUpdatePulseStatus.FINISHED, updater.pulse(UPDATE_ID));
}
/**
 * Recovery of a pulse-coordinated update stored as ROLL_FORWARD_PAUSED: it remains paused after
 * the subscriber starts, accepts a pulse while paused, and completes once resumed.
 */
@Test
public void testRecoverCoordinatedPausedFromStorage() throws Exception {
expectTaskKilled();
control.replay();
JobUpdate builder =
setInstanceCount(makeJobUpdate(makeInstanceConfig(0, 0, OLD_CONFIG)), 1).newBuilder();
builder.getInstructions().getSettings().setBlockIfNoPulsesAfterMs((int) PULSE_TIMEOUT_MS);
IJobUpdate update = IJobUpdate.build(builder);
insertInitialTasks(update);
changeState(JOB, 0, ASSIGNED, STARTING, RUNNING);
clock.advance(ONE_DAY);
// Persist the update directly in the paused state.
storage.write(storeProvider ->
saveJobUpdate(storeProvider.getJobUpdateStore(), update, ROLL_FORWARD_PAUSED));
subscriber.startAsync().awaitRunning();
ImmutableMultimap.Builder<Integer, JobUpdateAction> actions = ImmutableMultimap.builder();
assertState(ROLL_FORWARD_PAUSED, actions.build());
// The pulse is sent while still paused, before resume().
assertEquals(JobUpdatePulseStatus.OK, updater.pulse(UPDATE_ID));
updater.resume(UPDATE_ID, AUDIT);
changeState(JOB, 0, KILLED, ASSIGNED, STARTING, RUNNING);
clock.advance(WATCH_TIMEOUT);
actions.putAll(0, INSTANCE_UPDATING, INSTANCE_UPDATED);
assertState(ROLLED_FORWARD, actions.build());
assertEquals(JobUpdatePulseStatus.FINISHED, updater.pulse(UPDATE_ID));
}
/**
 * Pausing an update that is awaiting a pulse and then resuming it must return it to
 * ROLL_FORWARD_AWAITING_PULSE; a subsequent pulse lets both instances update.
 */
@Test
public void testResumeToAwaitingPulse() throws Exception {
expectTaskKilled().times(2);
control.replay();
JobUpdate builder =
setInstanceCount(makeJobUpdate(makeInstanceConfig(0, 1, OLD_CONFIG)), 2).newBuilder();
builder.getInstructions().getSettings().setBlockIfNoPulsesAfterMs((int) PULSE_TIMEOUT_MS);
IJobUpdate update = IJobUpdate.build(builder);
insertInitialTasks(update);
changeState(JOB, 0, ASSIGNED, STARTING, RUNNING);
changeState(JOB, 1, ASSIGNED, STARTING, RUNNING);
clock.advance(ONE_DAY);
ImmutableMultimap.Builder<Integer, JobUpdateAction> actions = ImmutableMultimap.builder();
updater.start(IJobUpdate.build(builder), AUDIT);
// The update is blocked initially waiting for a pulse.
assertState(ROLL_FORWARD_AWAITING_PULSE, actions.build());
// Pause the awaiting pulse update.
updater.pause(UPDATE_ID, AUDIT);
assertState(ROLL_FORWARD_PAUSED, actions.build());
// Resume into awaiting pulse state.
updater.resume(UPDATE_ID, AUDIT);
assertState(ROLL_FORWARD_AWAITING_PULSE, actions.build());
assertEquals(JobUpdatePulseStatus.OK, updater.pulse(UPDATE_ID));
// Instance 0 is updated.
changeState(JOB, 0, KILLED, ASSIGNED, STARTING, RUNNING);
clock.advance(WATCH_TIMEOUT);
// Instance 1 is updated.
changeState(JOB, 1, KILLED, ASSIGNED, STARTING, RUNNING);
clock.advance(WATCH_TIMEOUT);
actions.putAll(0, INSTANCE_UPDATING, INSTANCE_UPDATED)
.putAll(1, INSTANCE_UPDATING, INSTANCE_UPDATED);
assertState(ROLLED_FORWARD, actions.build());
assertEquals(JobUpdatePulseStatus.FINISHED, updater.pulse(UPDATE_ID));
}
/**
 * A pulse-coordinated update is paused after the pulse timeout has elapsed and pulsed while
 * paused; resuming is then expected to go straight to ROLLING_FORWARD and finish.
 */
@Test
public void testPulsePausedUpdate() throws Exception {
expectTaskKilled().times(2);
control.replay();
JobUpdate builder = makeJobUpdate(
// No-op - task is already matching the new config.
makeInstanceConfig(0, 0, NEW_CONFIG),
// Tasks needing update.
makeInstanceConfig(1, 2, OLD_CONFIG)).newBuilder();
builder.getInstructions().getSettings().setBlockIfNoPulsesAfterMs((int) PULSE_TIMEOUT_MS);
insertInitialTasks(IJobUpdate.build(builder));
changeState(JOB, 0, ASSIGNED, STARTING, RUNNING);
changeState(JOB, 1, ASSIGNED, STARTING, RUNNING);
changeState(JOB, 2, ASSIGNED, STARTING, RUNNING);
clock.advance(WATCH_TIMEOUT);
ImmutableMultimap.Builder<Integer, JobUpdateAction> actions = ImmutableMultimap.builder();
updater.start(IJobUpdate.build(builder), AUDIT);
// The update is blocked initially waiting for a pulse.
assertState(ROLL_FORWARD_AWAITING_PULSE, actions.build());
// Pulse arrives and update starts.
assertEquals(JobUpdatePulseStatus.OK, updater.pulse(UPDATE_ID));
changeState(JOB, 1, KILLED, ASSIGNED, STARTING, RUNNING);
actions.put(1, INSTANCE_UPDATING);
clock.advance(WATCH_TIMEOUT);
actions.put(1, INSTANCE_UPDATED);
actions.put(2, INSTANCE_UPDATING);
// Let the full pulse timeout elapse before pausing.
clock.advance(Amount.of(PULSE_TIMEOUT_MS, Time.MILLISECONDS));
// Update is paused
updater.pause(UPDATE_ID, AUDIT);
assertState(ROLL_FORWARD_PAUSED, actions.build());
// A paused update is pulsed.
assertEquals(JobUpdatePulseStatus.OK, updater.pulse(UPDATE_ID));
// Update is resumed
updater.resume(UPDATE_ID, AUDIT);
assertState(ROLLING_FORWARD, actions.build());
// Instance 2 is updated.
changeState(JOB, 2, KILLED, ASSIGNED, STARTING, RUNNING);
clock.advance(WATCH_TIMEOUT);
actions.put(2, INSTANCE_UPDATED);
assertState(ROLLED_FORWARD, actions.build());
assertJobState(JOB, ImmutableMap.of(0, NEW_CONFIG, 1, NEW_CONFIG, 2, NEW_CONFIG));
assertEquals(JobUpdatePulseStatus.FINISHED, updater.pulse(UPDATE_ID));
}
/**
 * Pulses an update that was awaiting a pulse but has since been deleted from storage. The pulse
 * must still return OK; no driver kill is expected in this test.
 */
@Test
public void testUnblockDeletedUpdate() throws Exception {
control.replay();
JobUpdate builder =
setInstanceCount(makeJobUpdate(makeInstanceConfig(0, 1, OLD_CONFIG)), 2).newBuilder();
builder.getInstructions().getSettings().setBlockIfNoPulsesAfterMs((int) PULSE_TIMEOUT_MS);
IJobUpdate update = IJobUpdate.build(builder);
insertInitialTasks(update);
changeState(JOB, 0, ASSIGNED, STARTING, RUNNING);
changeState(JOB, 1, ASSIGNED, STARTING, RUNNING);
clock.advance(ONE_DAY);
storage.write(
storeProvider -> saveJobUpdate(storeProvider.getJobUpdateStore(), update, ROLLING_FORWARD));
// One minute is longer than the 10 second pulse timeout configured above.
clock.advance(ONE_MINUTE);
subscriber.startAsync().awaitRunning();
ImmutableMultimap.Builder<Integer, JobUpdateAction> actions = ImmutableMultimap.builder();
// The update is blocked initially waiting for a pulse.
assertState(ROLL_FORWARD_AWAITING_PULSE, actions.build());
// Delete the update behind the updater's back; locks are released here because
// validateExitState() asserts that none remain.
storage.write((NoResult.Quiet) storeProvider -> {
storeProvider.getJobUpdateStore().deleteAllUpdatesAndEvents();
releaseAllLocks();
});
// The pulse still returns OK but the error is handled.
assertEquals(JobUpdatePulseStatus.OK, updater.pulse(UPDATE_ID));
}
/** Pulsing an update key that was never started must report {@code FINISHED}. */
@Test
public void testPulseInvalidUpdateId() throws Exception {
  control.replay();

  IJobUpdateKey unknownKey = IJobUpdateKey.build(new JobUpdateKey(JOB.newBuilder(), "invalid"));
  assertEquals(JobUpdatePulseStatus.FINISHED, updater.pulse(unknownKey));
}
/**
 * With no driver kill expectation recorded, the work triggered by the pulse fails and the mock
 * shutdown command is invoked. Its scripted answer releases all locks and throws the
 * {@link IllegalStateException} this test expects.
 */
@Test(expected = IllegalStateException.class)
public void testShutdownOnFailedPulse() throws Exception {
// Missing kill expectation will trigger failure.
shutdownCommand.execute();
expectLastCall().andAnswer(() -> {
// Locks are released here because validateExitState() asserts that none remain.
storage.write((NoResult.Quiet) storeProvider -> releaseAllLocks());
throw new IllegalStateException("Expected shutdown triggered.");
});
control.replay();
JobUpdate builder = makeJobUpdate(
// No-op - task is already matching the new config.
makeInstanceConfig(0, 0, NEW_CONFIG),
// Tasks needing update.
makeInstanceConfig(1, 2, OLD_CONFIG)).newBuilder();
builder.getInstructions().getSettings().setBlockIfNoPulsesAfterMs((int) PULSE_TIMEOUT_MS);
insertInitialTasks(IJobUpdate.build(builder));
changeState(JOB, 0, ASSIGNED, STARTING, RUNNING);
changeState(JOB, 1, ASSIGNED, STARTING, RUNNING);
changeState(JOB, 2, ASSIGNED, STARTING, RUNNING);
clock.advance(WATCH_TIMEOUT);
ImmutableMultimap.Builder<Integer, JobUpdateAction> actions = ImmutableMultimap.builder();
updater.start(IJobUpdate.build(builder), AUDIT);
// The update is blocked initially waiting for a pulse.
assertState(ROLL_FORWARD_AWAITING_PULSE, actions.build());
// Pulse arrives and update starts.
assertEquals(JobUpdatePulseStatus.OK, updater.pulse(UPDATE_ID));
// NOTE(review): presumably not reached, because the pulse above already throws - confirm.
changeState(JOB, 1, KILLED, ASSIGNED, STARTING, RUNNING);
}
/**
 * Roll-forward of three instances with an update group size of two and
 * wait-for-batch-completion enabled: instances 0 and 1 form the first batch, and instance 2 is
 * only updated after that batch is done.
 */
@Test
public void testSuccessfulBatchedUpdate() throws Exception {
expectTaskKilled().times(3);
control.replay();
JobUpdate builder = makeJobUpdate(makeInstanceConfig(0, 2, OLD_CONFIG)).newBuilder();
builder.getInstructions().getSettings()
.setWaitForBatchCompletion(true)
.setUpdateGroupSize(2);
IJobUpdate update = IJobUpdate.build(builder);
insertInitialTasks(update);
changeState(JOB, 0, ASSIGNED, STARTING, RUNNING);
changeState(JOB, 1, ASSIGNED, STARTING, RUNNING);
changeState(JOB, 2, ASSIGNED, STARTING, RUNNING);
clock.advance(WATCH_TIMEOUT);
ImmutableMultimap.Builder<Integer, JobUpdateAction> actions = ImmutableMultimap.builder();
// Instances 0 and 1 are updated.
updater.start(update, AUDIT);
actions.putAll(0, INSTANCE_UPDATING)
.putAll(1, INSTANCE_UPDATING);
assertState(ROLLING_FORWARD, actions.build());
// Instance 1 reaches RUNNING half a watch interval before instance 0 does.
changeState(JOB, 1, FINISHED, ASSIGNED, STARTING, RUNNING);
clock.advance(Amount.of(WATCH_TIMEOUT.getValue() / 2, Time.MILLISECONDS));
changeState(JOB, 0, FINISHED, ASSIGNED, STARTING, RUNNING);
clock.advance(Amount.of(WATCH_TIMEOUT.getValue() / 2, Time.MILLISECONDS));
// Instance 1 finished first, but update does not yet proceed until 0 finishes.
actions.putAll(1, INSTANCE_UPDATED);
assertState(ROLLING_FORWARD, actions.build());
clock.advance(WATCH_TIMEOUT);
actions.putAll(0, INSTANCE_UPDATED);
// Instance 2 is updated.
changeState(JOB, 2, FINISHED, ASSIGNED, STARTING, RUNNING);
clock.advance(WATCH_TIMEOUT);
actions.putAll(2, INSTANCE_UPDATING, INSTANCE_UPDATED);
assertState(ROLLED_FORWARD, actions.build());
assertJobState(
JOB,
ImmutableMap.of(0, NEW_CONFIG, 1, NEW_CONFIG, 2, NEW_CONFIG));
}
/**
 * Restricts the update to instance 0 via {@code updateOnlyTheseInstances}: instance 0 ends on
 * NEW_CONFIG while instance 1, also running OLD_CONFIG, is left untouched.
 */
@Test
public void testUpdateSpecificInstances() throws Exception {
expectTaskKilled();
control.replay();
JobUpdate builder =
setInstanceCount(makeJobUpdate(makeInstanceConfig(0, 0, OLD_CONFIG)), 1).newBuilder();
builder.getInstructions().getSettings().setUpdateOnlyTheseInstances(
ImmutableSet.of(new Range(0, 0)));
IJobUpdate update = IJobUpdate.build(builder);
// Two instances exist on OLD_CONFIG; only instance 0 is covered by the update.
insertPendingTasks(OLD_CONFIG, ImmutableSet.of(0, 1));
changeState(JOB, 0, ASSIGNED, STARTING, RUNNING);
changeState(JOB, 1, ASSIGNED, STARTING, RUNNING);
clock.advance(WATCH_TIMEOUT);
// Instance 0 is updated
updater.start(update, AUDIT);
changeState(JOB, 0, KILLED, ASSIGNED, STARTING, RUNNING);
clock.advance(WATCH_TIMEOUT);
ImmutableMultimap.Builder<Integer, JobUpdateAction> actions = ImmutableMultimap.builder();
assertState(
ROLLED_FORWARD,
actions.putAll(0, INSTANCE_UPDATING, INSTANCE_UPDATED).build());
assertJobState(
JOB,
ImmutableMap.of(0, NEW_CONFIG, 1, OLD_CONFIG));
}
/**
 * Verifies that an instance named in {@code updateOnlyTheseInstances} but already running
 * {@code NEW_CONFIG} produces no update actions. The desired state covers instance 1 only,
 * the instance filter is range 0-1, and the expected outcome is {@code ROLLED_FORWARD} with
 * actions for instance 1 alone while instance 2 stays on {@code OLD_CONFIG}.
 */
@Test
public void testUpdateSpecificInstancesSkipUnchanged() throws Exception {
// No expectTaskKilled() call: this scenario registers no kill expectation.
control.replay();
JobUpdate builder = makeJobUpdate().newBuilder();
builder.getInstructions().getDesiredState().setInstances(ImmutableSet.of(new Range(1, 1)));
builder.getInstructions().getSettings().setUpdateOnlyTheseInstances(
ImmutableSet.of(new Range(0, 1)));
IJobUpdate update = IJobUpdate.build(builder);
// Instance 0 already runs the new config; instance 2 runs the old config; instance 1 is absent.
insertPendingTasks(NEW_CONFIG, ImmutableSet.of(0));
insertPendingTasks(OLD_CONFIG, ImmutableSet.of(2));
changeState(JOB, 0, ASSIGNED, STARTING, RUNNING);
changeState(JOB, 2, ASSIGNED, STARTING, RUNNING);
clock.advance(WATCH_TIMEOUT);
// Instance 1 is added, while instance 0 is skipped
updater.start(update, AUDIT);
changeState(JOB, 1, ASSIGNED, STARTING, RUNNING);
clock.advance(WATCH_TIMEOUT);
ImmutableMultimap.Builder<Integer, JobUpdateAction> actions = ImmutableMultimap.builder();
assertState(
ROLLED_FORWARD,
actions.putAll(1, INSTANCE_UPDATING, INSTANCE_UPDATED).build());
assertJobState(
JOB,
ImmutableMap.of(0, NEW_CONFIG, 1, NEW_CONFIG, 2, OLD_CONFIG));
}
/**
 * Walks an update through a failure-triggered rollback, including pausing and resuming the
 * rollback. The initial state has instances 0, 2 and 3 on {@code OLD_CONFIG}. Instance 0 is
 * updated, instance 1 is added, instance 2 fails, and the update is then expected to roll back
 * to {@code ROLLED_BACK} with instances 0, 2 and 3 on {@code OLD_CONFIG}.
 */
@Test
public void testRollback() throws Exception {
expectTaskKilled().times(4);
control.replay();
// Initial state deliberately has a gap: there is no instance 1 before the update.
IJobUpdate update = makeJobUpdate(
makeInstanceConfig(0, 0, OLD_CONFIG),
makeInstanceConfig(2, 3, OLD_CONFIG));
insertInitialTasks(update);
changeState(JOB, 0, ASSIGNED, STARTING, RUNNING);
changeState(JOB, 2, ASSIGNED, STARTING, RUNNING);
changeState(JOB, 3, ASSIGNED, STARTING, RUNNING);
clock.advance(WATCH_TIMEOUT);
// Accumulates the per-instance actions expected so far; reused for every assertState below.
ImmutableMultimap.Builder<Integer, JobUpdateAction> actions = ImmutableMultimap.builder();
// Instance 0 is updated.
updater.start(update, AUDIT);
actions.putAll(0, INSTANCE_UPDATING);
assertState(ROLLING_FORWARD, actions.build());
changeState(JOB, 0, KILLED, ASSIGNED, STARTING, RUNNING);
clock.advance(WATCH_TIMEOUT);
// Instance 1 is added.
changeState(JOB, 1, ASSIGNED, STARTING, RUNNING);
actions.putAll(0, INSTANCE_UPDATED)
.putAll(1, INSTANCE_UPDATING, INSTANCE_UPDATED);
clock.advance(WATCH_TIMEOUT);
// Instance 2 is updated, but fails.
changeState(JOB, 2, KILLED, ASSIGNED, STARTING, RUNNING);
actions.putAll(2, INSTANCE_UPDATING, INSTANCE_UPDATE_FAILED, INSTANCE_ROLLING_BACK);
// The task fails after only FLAPPING_THRESHOLD has elapsed since it reached RUNNING.
clock.advance(FLAPPING_THRESHOLD);
changeState(JOB, 2, FAILED);
// Instance 2 is rolled back.
assertState(ROLLING_BACK, actions.build());
assertLatestUpdateMessage(JobUpdateControllerImpl.failureMessage(2, Failure.EXITED));
changeState(JOB, 2, ASSIGNED, STARTING, RUNNING);
actions.putAll(1, INSTANCE_ROLLING_BACK)
.putAll(2, INSTANCE_ROLLED_BACK);
clock.advance(WATCH_TIMEOUT);
// A rollback may be paused.
updater.pause(UPDATE_ID, AUDIT);
assertState(ROLL_BACK_PAUSED, actions.build());
// The recorded actions are asserted unchanged after a day spent paused.
clock.advance(ONE_DAY);
updater.resume(UPDATE_ID, AUDIT);
assertState(ROLLING_BACK, actions.build());
// Instance 1 is removed.
changeState(JOB, 1, KILLED);
actions.putAll(1, INSTANCE_ROLLED_BACK);
clock.advance(WATCH_TIMEOUT);
// Instance 0 is rolled back.
changeState(JOB, 0, KILLED, ASSIGNED, STARTING, RUNNING);
actions.putAll(0, INSTANCE_ROLLING_BACK, INSTANCE_ROLLED_BACK);
clock.advance(WATCH_TIMEOUT);
assertState(ROLLED_BACK, actions.build());
// Instance 1 is absent again, matching the initial state.
assertJobState(JOB, ImmutableMap.of(0, OLD_CONFIG, 2, OLD_CONFIG, 3, OLD_CONFIG));
}
/**
 * Same forward path as the rollback scenario, but with {@code rollbackOnFailure} set to
 * {@code false}: when instance 2 fails, the update is expected to end in
 * {@code JobUpdateStatus.FAILED} and no rollback actions are recorded.
 */
@Test
public void testRollbackDisabled() throws Exception {
expectTaskKilled().times(2);
control.replay();
JobUpdate builder = makeJobUpdate(
makeInstanceConfig(0, 0, OLD_CONFIG),
makeInstanceConfig(2, 3, OLD_CONFIG))
.newBuilder();
// Disable automatic rollback for this update.
builder.getInstructions().getSettings().setRollbackOnFailure(false);
IJobUpdate update = IJobUpdate.build(builder);
insertInitialTasks(update);
changeState(JOB, 0, ASSIGNED, STARTING, RUNNING);
changeState(JOB, 2, ASSIGNED, STARTING, RUNNING);
changeState(JOB, 3, ASSIGNED, STARTING, RUNNING);
clock.advance(WATCH_TIMEOUT);
ImmutableMultimap.Builder<Integer, JobUpdateAction> actions = ImmutableMultimap.builder();
// Instance 0 is updated.
updater.start(update, AUDIT);
actions.putAll(0, INSTANCE_UPDATING);
assertState(ROLLING_FORWARD, actions.build());
changeState(JOB, 0, KILLED, ASSIGNED, STARTING, RUNNING);
clock.advance(WATCH_TIMEOUT);
// Instance 1 is added.
changeState(JOB, 1, ASSIGNED, STARTING, RUNNING);
actions.putAll(0, INSTANCE_UPDATED)
.putAll(1, INSTANCE_UPDATING, INSTANCE_UPDATED);
clock.advance(WATCH_TIMEOUT);
// Instance 2 is updated, but fails.
changeState(JOB, 2, KILLED, ASSIGNED, STARTING, RUNNING);
// Unlike the rollback-enabled case, no INSTANCE_ROLLING_BACK action is expected here.
actions.putAll(2, INSTANCE_UPDATING, INSTANCE_UPDATE_FAILED);
clock.advance(FLAPPING_THRESHOLD);
changeState(JOB, 2, FAILED);
clock.advance(WATCH_TIMEOUT);
// Rollback is disabled, update fails.
assertState(JobUpdateStatus.FAILED, actions.build());
}
/**
 * Verifies aborting an update mid-flight: after instance 0 has been updated and instance 1
 * has begun updating, {@code abort} is expected to leave the update {@code ABORTED} with the
 * actions recorded so far, and the job with instances 0 and 1 on {@code NEW_CONFIG} and
 * instance 2 on {@code OLD_CONFIG}.
 */
@Test
public void testAbort() throws Exception {
expectTaskKilled();
control.replay();
IJobUpdate update = makeJobUpdate(makeInstanceConfig(0, 2, OLD_CONFIG));
insertInitialTasks(update);
// Only instances 0 and 2 are driven to RUNNING; instance 1 receives no state change here.
changeState(JOB, 0, ASSIGNED, STARTING, RUNNING);
changeState(JOB, 2, ASSIGNED, STARTING, RUNNING);
clock.advance(WATCH_TIMEOUT);
// Instance 0 is updated
updater.start(update, AUDIT);
changeState(JOB, 0, KILLED, ASSIGNED, STARTING, RUNNING);
clock.advance(WATCH_TIMEOUT);
ImmutableMultimap.Builder<Integer, JobUpdateAction> actions = ImmutableMultimap.builder();
// Instance 1 has started updating but has no INSTANCE_UPDATED action when the abort arrives.
actions.putAll(0, INSTANCE_UPDATING, INSTANCE_UPDATED)
.putAll(1, INSTANCE_UPDATING);
updater.abort(UPDATE_ID, AUDIT);
assertState(ABORTED, actions.build());
clock.advance(WATCH_TIMEOUT);
assertJobState(JOB, ImmutableMap.of(0, NEW_CONFIG, 1, NEW_CONFIG, 2, OLD_CONFIG));
}
/**
 * Verifies the outcome when the rollback itself fails: instance 0 is updated, instance 1 fails
 * during the update and then fails again during the rollback. The update is expected to end in
 * {@code JobUpdateStatus.FAILED} with instance 0 on {@code NEW_CONFIG} and instance 1 on
 * {@code OLD_CONFIG}.
 */
@Test
public void testRollbackFailed() throws Exception {
expectTaskKilled().times(2);
control.replay();
IJobUpdate update = makeJobUpdate(
makeInstanceConfig(0, 1, OLD_CONFIG));
insertInitialTasks(update);
changeState(JOB, 0, ASSIGNED, STARTING, RUNNING);
changeState(JOB, 1, ASSIGNED, STARTING, RUNNING);
clock.advance(WATCH_TIMEOUT);
ImmutableMultimap.Builder<Integer, JobUpdateAction> actions = ImmutableMultimap.builder();
// Instance 0 is updated.
updater.start(update, AUDIT);
actions.putAll(0, INSTANCE_UPDATING);
assertState(ROLLING_FORWARD, actions.build());
changeState(JOB, 0, KILLED, ASSIGNED, STARTING, RUNNING);
clock.advance(WATCH_TIMEOUT);
// Instance 1 is updated, but fails.
changeState(JOB, 1, KILLED, ASSIGNED, STARTING, RUNNING);
clock.advance(FLAPPING_THRESHOLD);
changeState(JOB, 1, FAILED);
// Instance 1 is rolled back, but fails.
actions.putAll(0, INSTANCE_UPDATED)
.putAll(1, INSTANCE_UPDATING, INSTANCE_UPDATE_FAILED, INSTANCE_ROLLING_BACK);
assertState(ROLLING_BACK, actions.build());
// The replacement task started by the rollback also fails after FLAPPING_THRESHOLD.
changeState(JOB, 1, ASSIGNED, STARTING, RUNNING);
clock.advance(FLAPPING_THRESHOLD);
changeState(JOB, 1, FAILED);
actions.putAll(1, INSTANCE_ROLLBACK_FAILED);
assertState(JobUpdateStatus.FAILED, actions.build());
clock.advance(WATCH_TIMEOUT);
assertJobState(JOB, ImmutableMap.of(0, NEW_CONFIG, 1, OLD_CONFIG));
}
/**
 * Releases every lock that the lock manager currently reports, simulating a job lock that was
 * removed while an update is still in flight.
 */
private void releaseAllLocks() {
  for (ILock heldLock : lockManager.getLocks()) {
    lockManager.releaseLock(heldLock);
  }
}
/**
 * Verifies that an update whose lock disappears is moved to {@code ERROR}: all locks are
 * released right after the update starts, and the next task state change is expected to
 * produce the {@code ERROR} status with {@code LOST_LOCK_MESSAGE} as the latest message.
 */
@Test
public void testLostLock() throws Exception {
expectTaskKilled();
control.replay();
IJobUpdate update = makeJobUpdate(
makeInstanceConfig(0, 1, OLD_CONFIG));
insertInitialTasks(update);
changeState(JOB, 0, ASSIGNED, STARTING, RUNNING);
changeState(JOB, 1, ASSIGNED, STARTING, RUNNING);
clock.advance(WATCH_TIMEOUT);
// Instance 0 is updated.
updater.start(update, AUDIT);
// Drop the lock while the update is in progress, then deliver a state change for instance 0.
releaseAllLocks();
changeState(JOB, 0, KILLED);
ImmutableMultimap.Builder<Integer, JobUpdateAction> actions = ImmutableMultimap.builder();
actions.putAll(0, INSTANCE_UPDATING);
assertState(ERROR, actions.build());
assertLatestUpdateMessage(JobUpdateControllerImpl.LOST_LOCK_MESSAGE);
}
/**
 * Asserts that starting {@code update} is rejected with an {@link IllegalArgumentException}.
 * The test fails with a descriptive message if the updater accepts the update.
 *
 * @param update Mutable update whose settings are expected to fail validation.
 * @throws UpdateStateException Propagated unchanged if the updater throws it.
 * @throws UpdateConfigurationException Propagated unchanged if the updater throws it.
 */
private void expectInvalid(JobUpdate update)
    throws UpdateStateException, UpdateConfigurationException {

  try {
    updater.start(IJobUpdate.build(update), AUDIT);
    fail("Expected IllegalArgumentException when starting an update with invalid settings");
  } catch (IllegalArgumentException e) {
    // Expected.
  }
}
/**
 * Verifies that updates carrying a negative update group size or a negative minimum
 * wait-in-running time are rejected when started.
 */
@Test
public void testStartInvalidUpdate() throws Exception {
  control.replay();

  // Case 1: negative update group size.
  JobUpdate negativeGroupSize = makeJobUpdate().newBuilder();
  negativeGroupSize.getInstructions().getSettings().setUpdateGroupSize(-1);
  expectInvalid(negativeGroupSize);

  // Case 2: negative minimum time an instance must stay RUNNING.
  JobUpdate negativeMinWait = makeJobUpdate().newBuilder();
  negativeMinWait.getInstructions().getSettings().setMinWaitInInstanceRunningMs(-1);
  expectInvalid(negativeMinWait);
}
/**
 * Simulates a change in input validation after a job update has been persisted. While the
 * update is rolling forward, the stored update is wiped and replaced by a copy whose update
 * group size is 0. Once instance 1 fails, the update is expected to end in {@code ERROR}.
 */
@Test
public void testConfigurationPolicyChange() throws Exception {
  expectTaskKilled().times(2);
  control.replay();

  IJobUpdate update = setInstanceCount(makeJobUpdate(makeInstanceConfig(0, 1, OLD_CONFIG)), 2);
  insertInitialTasks(update);
  changeState(JOB, 0, ASSIGNED, STARTING, RUNNING);
  changeState(JOB, 1, ASSIGNED, STARTING, RUNNING);
  clock.advance(WATCH_TIMEOUT);
  ImmutableMultimap.Builder<Integer, JobUpdateAction> actions = ImmutableMultimap.builder();

  // Instance 0 is updated
  updater.start(update, AUDIT);
  actions.putAll(0, INSTANCE_UPDATING);
  assertState(ROLLING_FORWARD, actions.build());

  storage.write((NoResult.Quiet) storeProvider -> {
    JobUpdateStore.Mutable store = storeProvider.getJobUpdateStore();
    store.deleteAllUpdatesAndEvents();

    // Re-persist the same update with a group size of 0 (presumably no longer valid).
    JobUpdate builder = update.newBuilder();
    builder.getInstructions().getSettings().setUpdateGroupSize(0);
    // saveJobUpdate acquires the job lock itself, so the existing lock is released first.
    releaseAllLocks();
    saveJobUpdate(store, IJobUpdate.build(builder), ROLLING_FORWARD);
  });

  changeState(JOB, 0, KILLED, ASSIGNED, STARTING, RUNNING);
  clock.advance(WATCH_TIMEOUT);

  // Instance 1 is updated, but fails.
  changeState(JOB, 1, KILLED, ASSIGNED, STARTING, RUNNING, FAILED);
  // Actions is reset here since we wiped the updates tables earlier in the test case.
  actions = ImmutableMultimap.builder();
  actions.putAll(0, INSTANCE_UPDATED)
      .putAll(1, INSTANCE_UPDATING, INSTANCE_UPDATE_FAILED);
  clock.advance(WATCH_TIMEOUT);
  assertState(ERROR, actions.build());
}
/**
 * Persists {@code update} directly into the store, bypassing the updater: acquires the job
 * lock, saves the update with that lock's token, and records an event with {@code status}.
 *
 * @param store Mutable job update store to write into.
 * @param update Update to persist.
 * @param status Status recorded in the accompanying update event.
 * @return The lock acquired for the update's job.
 */
private ILock saveJobUpdate(
    JobUpdateStore.Mutable store,
    IJobUpdate update,
    JobUpdateStatus status) {

  ILock acquired;
  try {
    ILockKey jobLockKey =
        ILockKey.build(LockKey.job(update.getSummary().getKey().getJob().newBuilder()));
    acquired = lockManager.acquireLock(jobLockKey, USER);
  } catch (LockManager.LockException e) {
    // Lock failures are rethrown via Guava so callers need no extra throws clause.
    throw Throwables.propagate(e);
  }

  store.saveJobUpdate(update, Optional.of(acquired.getToken()));
  saveJobUpdateEvent(store, update, status);
  return acquired;
}
/**
 * Records a job update event for {@code update} with the given status, timestamped with the
 * test clock's current time.
 *
 * @param store Mutable job update store to write into.
 * @param update Update whose key the event is stored under.
 * @param status Status carried by the event.
 */
private void saveJobUpdateEvent(
    JobUpdateStore.Mutable store,
    IJobUpdate update,
    JobUpdateStatus status) {

  IJobUpdateKey updateKey = update.getSummary().getKey();
  JobUpdateEvent event = new JobUpdateEvent()
      .setStatus(status)
      .setTimestampMs(clock.nowMillis());
  store.saveJobUpdateEvent(updateKey, IJobUpdateEvent.build(event));
}
/**
 * Verifies that an update already persisted as {@code ROLLING_FORWARD} is picked up when the
 * subscriber service starts: the update is written straight to storage rather than started
 * through the updater, and is then expected to update both instances and finish
 * {@code ROLLED_FORWARD}.
 */
@Test
public void testRecoverFromStorage() throws Exception {
expectTaskKilled().times(2);
control.replay();
IJobUpdate update = setInstanceCount(makeJobUpdate(makeInstanceConfig(0, 1, OLD_CONFIG)), 2);
insertInitialTasks(update);
changeState(JOB, 0, ASSIGNED, STARTING, RUNNING);
changeState(JOB, 1, ASSIGNED, STARTING, RUNNING);
clock.advance(ONE_DAY);
// Persist the in-progress update directly; updater.start() is never called in this test.
storage.write(
storeProvider -> saveJobUpdate(storeProvider.getJobUpdateStore(), update, ROLLING_FORWARD));
// Starting the subscriber is what triggers recovery of the stored update.
subscriber.startAsync().awaitRunning();
// Instance 0 is updated.
changeState(JOB, 0, KILLED, ASSIGNED, STARTING, RUNNING);
clock.advance(WATCH_TIMEOUT);
// Instance 1 is updated.
changeState(JOB, 1, KILLED, ASSIGNED, STARTING, RUNNING);
clock.advance(WATCH_TIMEOUT);
ImmutableMultimap.Builder<Integer, JobUpdateAction> actions = ImmutableMultimap.builder();
actions.putAll(0, INSTANCE_UPDATING, INSTANCE_UPDATED)
.putAll(1, INSTANCE_UPDATING, INSTANCE_UPDATED);
assertState(ROLLED_FORWARD, actions.build());
}
/**
 * Verifies that a stored {@code ROLLING_FORWARD} update whose lock has been released is moved
 * to {@code ERROR}, with no instance actions, when the subscriber service starts.
 */
@Test
public void testSystemResumeNoLock() throws Exception {
control.replay();
// setInstanceCount(..., 0) sets the desired instance range to (0, -1).
IJobUpdate update = setInstanceCount(makeJobUpdate(makeInstanceConfig(0, 1, OLD_CONFIG)), 0);
storage.write((NoResult.Quiet) storeProvider -> {
// Save the update, then immediately drop the lock that saveJobUpdate acquired for it.
ILock lock = saveJobUpdate(storeProvider.getJobUpdateStore(), update, ROLLING_FORWARD);
lockManager.releaseLock(lock);
});
subscriber.startAsync().awaitRunning();
assertState(ERROR, ImmutableMultimap.of());
}
/**
 * Starts an update whose initial state already has instances 0-2 on {@code NEW_CONFIG}, the
 * same config and range as the desired state built by {@code makeJobUpdate}.
 *
 * <p>NOTE(review): there is no assertion after {@code updater.start}; as written the test only
 * checks that starting such an update does not throw. Consider asserting the resulting update
 * state once the expected terminal status is confirmed.
 */
@Test
public void testImmediatelySuccessfulUpdate() throws Exception {
control.replay();
IJobUpdate update = makeJobUpdate(makeInstanceConfig(0, 2, NEW_CONFIG));
insertInitialTasks(update);
changeState(JOB, 0, ASSIGNED, STARTING, RUNNING);
changeState(JOB, 1, ASSIGNED, STARTING, RUNNING);
changeState(JOB, 2, ASSIGNED, STARTING, RUNNING);
clock.advance(ONE_DAY);
updater.start(update, AUDIT);
}
/**
 * Verifies that starting an update with no desired state and no initial state is rejected
 * with an {@link IllegalArgumentException}.
 */
@Test(expected = IllegalArgumentException.class)
public void testNoopUpdateEmptyDiff() throws Exception {
  control.replay();

  // makeJobUpdate() with no arguments adds no initial state; the desired state is removed too.
  JobUpdate withoutDesiredState = makeJobUpdate().newBuilder();
  withoutDesiredState.getInstructions().unsetDesiredState();

  IJobUpdate emptyDiffUpdate = IJobUpdate.build(withoutDesiredState);
  updater.start(emptyDiffUpdate, AUDIT);
}
/**
 * Verifies that an update stays {@code ROLLING_FORWARD} while a replacement task has not been
 * scheduled, and that it can still be aborted: instance 1 is killed but receives no further
 * state changes, the update is asserted to remain {@code ROLLING_FORWARD}, and an abort is
 * expected to move it to {@code ABORTED}.
 */
@Test
public void testSlowToScheduleTask() throws Exception {
expectTaskKilled().times(2);
control.replay();
IJobUpdate update = setInstanceCount(makeJobUpdate(makeInstanceConfig(0, 1, OLD_CONFIG)), 2);
insertInitialTasks(update);
changeState(JOB, 0, ASSIGNED, STARTING, RUNNING);
changeState(JOB, 1, ASSIGNED, STARTING, RUNNING);
// Unlike most scenarios in this file, the clock is not advanced before the update starts.
ImmutableMultimap.Builder<Integer, JobUpdateAction> actions = ImmutableMultimap.builder();
// Instance 0 is updated.
updater.start(update, AUDIT);
actions.putAll(0, INSTANCE_UPDATING);
assertState(ROLLING_FORWARD, actions.build());
changeState(JOB, 0, KILLED, ASSIGNED, STARTING, RUNNING);
clock.advance(WATCH_TIMEOUT);
// Instance 1 is not advancing past PENDING.
changeState(JOB, 1, KILLED);
actions.putAll(0, INSTANCE_UPDATED)
.putAll(1, INSTANCE_UPDATING);
assertState(ROLLING_FORWARD, actions.build());
updater.abort(update.getSummary().getKey(), AUDIT);
assertState(ABORTED, actions.build());
}
/**
 * Verifies an update that only adds an instance: instances 0 and 1 already run
 * {@code NEW_CONFIG}, the desired state covers instances 0-2, and the update is expected to
 * record actions for instance 2 alone and finish {@code ROLLED_FORWARD}.
 */
@Test
public void testAddInstances() throws Exception {
// No kill expectation is registered for this scenario.
control.replay();
IJobUpdate update = makeJobUpdate();
insertPendingTasks(NEW_CONFIG, ImmutableSet.of(0, 1));
changeState(JOB, 0, ASSIGNED, STARTING, RUNNING);
changeState(JOB, 1, ASSIGNED, STARTING, RUNNING);
clock.advance(WATCH_TIMEOUT);
ImmutableMultimap.Builder<Integer, JobUpdateAction> actions = ImmutableMultimap.builder();
// Instance 2 is added
updater.start(update, AUDIT);
actions.putAll(2, INSTANCE_UPDATING);
assertState(ROLLING_FORWARD, actions.build());
changeState(JOB, 2, ASSIGNED, STARTING, RUNNING);
clock.advance(WATCH_TIMEOUT);
actions.putAll(2, INSTANCE_UPDATED);
assertState(ROLLED_FORWARD, actions.build());
assertJobState(
JOB,
ImmutableMap.of(0, NEW_CONFIG, 1, NEW_CONFIG, 2, NEW_CONFIG));
}
/**
 * Verifies an update that only removes an instance: instances 0 and 1 already run
 * {@code NEW_CONFIG}, the desired instance count is 1, and the update is expected to record
 * actions for instance 1 alone and finish {@code ROLLED_FORWARD} with only instance 0 left.
 */
@Test
public void testRemoveInstances() throws Exception {
expectTaskKilled();
control.replay();
// Set instance count such that instance 1 is removed.
IJobUpdate update = setInstanceCount(makeJobUpdate(makeInstanceConfig(0, 1, NEW_CONFIG)), 1);
insertInitialTasks(update);
changeState(JOB, 0, ASSIGNED, STARTING, RUNNING);
changeState(JOB, 1, ASSIGNED, STARTING, RUNNING);
clock.advance(WATCH_TIMEOUT);
ImmutableMultimap.Builder<Integer, JobUpdateAction> actions = ImmutableMultimap.builder();
// Instance 1 is removed.
updater.start(update, AUDIT);
actions.putAll(1, INSTANCE_UPDATING);
// The removed instance is only killed; no replacement task is brought up for it.
changeState(JOB, 1, KILLED);
clock.advance(WATCH_TIMEOUT);
actions.put(1, INSTANCE_UPDATED);
assertState(ROLLED_FORWARD, actions.build());
assertJobState(JOB, ImmutableMap.of(0, NEW_CONFIG));
}
/**
 * Delivers a task state change built from an empty {@code ScheduledTask} to the subscriber.
 * The test has no assertions; it passes as long as the subscriber does not throw.
 */
@Test
public void testBadPubsubUpdate() {
  control.replay();

  IScheduledTask emptyTask = IScheduledTask.build(new ScheduledTask());
  subscriber.taskChangedState(PubsubEvent.TaskStateChange.transition(emptyTask, RUNNING));
}
/**
 * Verifies that pausing an update that was never started or stored is rejected with an
 * {@code UpdateStateException}.
 */
@Test(expected = UpdateStateException.class)
public void testPauseUnknownUpdate() throws Exception {
control.replay();
// No update with UPDATE_ID has been created in this test.
updater.pause(UPDATE_ID, AUDIT);
}
/**
 * Verifies that aborting an update after its lock has been released leaves the update in
 * {@code ERROR} with the actions recorded before the lock was lost.
 */
@Test
public void testAbortAfterLostLock() throws Exception {
expectTaskKilled();
control.replay();
IJobUpdate update = makeJobUpdate(makeInstanceConfig(0, 0, OLD_CONFIG));
insertInitialTasks(update);
changeState(JOB, 0, ASSIGNED, STARTING, RUNNING);
clock.advance(WATCH_TIMEOUT);
ImmutableMultimap.Builder<Integer, JobUpdateAction> actions = ImmutableMultimap.builder();
updater.start(update, AUDIT);
actions.putAll(0, INSTANCE_UPDATING);
assertState(ROLLING_FORWARD, actions.build());
// Drop the lock first, then request the abort.
releaseAllLocks();
updater.abort(update.getSummary().getKey(), AUDIT);
clock.advance(WATCH_TIMEOUT);
// The final status is asserted to be ERROR, not ABORTED.
assertState(ERROR, actions.build());
}
/**
 * Regression test for AURORA-1023: an update is started and paused, the job lock is forcibly
 * released, and starting a second update with a different id for the same job is expected to
 * be rejected with an {@code UpdateStateException}.
 */
@Test
public void testStartUpdateAfterPausedAndLockLost() throws Exception {
  // Tests for regression of AURORA-1023, in which a user could paint themselves into a corner
  // by starting an update, pausing it, and forcibly releasing the job lock. The result in this
  // behavior should be to prevent further job updates until the user aborts the first one.

  expectTaskKilled();
  control.replay();

  IJobUpdate update = makeJobUpdate(makeInstanceConfig(0, 0, OLD_CONFIG));
  insertInitialTasks(update);
  changeState(JOB, 0, ASSIGNED, STARTING, RUNNING);
  clock.advance(WATCH_TIMEOUT);
  ImmutableMultimap.Builder<Integer, JobUpdateAction> actions = ImmutableMultimap.builder();

  updater.start(update, AUDIT);
  actions.putAll(0, INSTANCE_UPDATING);
  assertState(ROLLING_FORWARD, actions.build());

  updater.pause(update.getSummary().getKey(), AUDIT);
  assertState(ROLL_FORWARD_PAUSED, actions.build());
  clock.advance(WATCH_TIMEOUT);

  // Forcibly drop the job lock while the first update is still paused.
  releaseAllLocks();

  // Second update: same job and instructions, different update id.
  JobUpdate builder = makeJobUpdate(makeInstanceConfig(0, 0, OLD_CONFIG)).newBuilder();
  builder.getSummary().getKey().setId("another update");
  IJobUpdate update2 = IJobUpdate.build(builder);

  try {
    updater.start(update2, AUDIT);
    fail("Starting a second update should be rejected while the paused update still exists");
  } catch (UpdateStateException e) {
    // Expected.
  }
}
/**
 * Verifies that resuming an update that was never started or stored is rejected with an
 * {@code UpdateStateException}.
 */
@Test(expected = UpdateStateException.class)
public void testResumeUnknownUpdate() throws Exception {
control.replay();
// No update with UPDATE_ID has been created in this test.
updater.resume(UPDATE_ID, AUDIT);
}
/**
 * Verifies that a completed update cannot be rolled back: a batched update (group size 2,
 * waiting for batch completion) is driven to {@code ROLLED_FORWARD}, after which a rollback
 * request is expected to be rejected with an {@code UpdateStateException}.
 */
@Test
public void testFailToRollbackCompletedUpdate() throws Exception {
  expectTaskKilled().times(3);
  control.replay();

  JobUpdate builder = makeJobUpdate(makeInstanceConfig(0, 2, OLD_CONFIG)).newBuilder();
  builder.getInstructions().getSettings()
      .setWaitForBatchCompletion(true)
      .setUpdateGroupSize(2);
  IJobUpdate update = IJobUpdate.build(builder);
  insertInitialTasks(update);

  changeState(JOB, 0, ASSIGNED, STARTING, RUNNING);
  changeState(JOB, 1, ASSIGNED, STARTING, RUNNING);
  changeState(JOB, 2, ASSIGNED, STARTING, RUNNING);
  clock.advance(WATCH_TIMEOUT);
  ImmutableMultimap.Builder<Integer, JobUpdateAction> actions = ImmutableMultimap.builder();

  // Instances 0 and 1 are updated.
  updater.start(update, AUDIT);
  actions.putAll(0, INSTANCE_UPDATING)
      .putAll(1, INSTANCE_UPDATING);
  assertState(ROLLING_FORWARD, actions.build());

  // Instance 1 restarts half a watch period before instance 0 does.
  changeState(JOB, 1, FINISHED, ASSIGNED, STARTING, RUNNING);
  clock.advance(Amount.of(WATCH_TIMEOUT.getValue() / 2, Time.MILLISECONDS));
  changeState(JOB, 0, FINISHED, ASSIGNED, STARTING, RUNNING);
  clock.advance(Amount.of(WATCH_TIMEOUT.getValue() / 2, Time.MILLISECONDS));

  // Instance 1 finished first, but update does not yet proceed until 0 finishes.
  actions.putAll(1, INSTANCE_UPDATED);
  assertState(ROLLING_FORWARD, actions.build());
  clock.advance(WATCH_TIMEOUT);
  actions.putAll(0, INSTANCE_UPDATED);

  // Instance 2 is updated.
  changeState(JOB, 2, FINISHED, ASSIGNED, STARTING, RUNNING);
  clock.advance(WATCH_TIMEOUT);
  actions.putAll(2, INSTANCE_UPDATING, INSTANCE_UPDATED);
  assertState(ROLLED_FORWARD, actions.build());
  assertJobState(
      JOB,
      ImmutableMap.of(0, NEW_CONFIG, 1, NEW_CONFIG, 2, NEW_CONFIG));

  // The update is already complete, so a rollback request must be refused.
  try {
    updater.rollback(UPDATE_ID, AUDIT);
    fail("Rolling back an update that already finished ROLLED_FORWARD should be rejected");
  } catch (UpdateStateException e) {
    // Expected.
  }
}
/**
 * Verifies an explicitly requested rollback while a batched update (group size 2, waiting for
 * batch completion) is still rolling forward: instances 0 and 1 have been updated and
 * instance 2 has begun updating when {@code rollback} is called. The update is expected to
 * pass through {@code ROLLING_BACK} and end {@code ROLLED_BACK} with all three instances on
 * {@code OLD_CONFIG}.
 */
@Test
public void testRollbackDuringUpgrade() throws Exception {
expectTaskKilled().times(5);
control.replay();
JobUpdate builder = makeJobUpdate(makeInstanceConfig(0, 2, OLD_CONFIG)).newBuilder();
builder.getInstructions().getSettings()
.setWaitForBatchCompletion(true)
.setUpdateGroupSize(2);
IJobUpdate update = IJobUpdate.build(builder);
insertInitialTasks(update);
changeState(JOB, 0, ASSIGNED, STARTING, RUNNING);
changeState(JOB, 1, ASSIGNED, STARTING, RUNNING);
changeState(JOB, 2, ASSIGNED, STARTING, RUNNING);
clock.advance(WATCH_TIMEOUT);
ImmutableMultimap.Builder<Integer, JobUpdateAction> actions = ImmutableMultimap.builder();
// Instances 0 and 1 are updated.
updater.start(update, AUDIT);
actions.putAll(0, INSTANCE_UPDATING)
.putAll(1, INSTANCE_UPDATING);
assertState(ROLLING_FORWARD, actions.build());
changeState(JOB, 1, FINISHED, ASSIGNED, STARTING, RUNNING);
changeState(JOB, 0, FINISHED, ASSIGNED, STARTING, RUNNING);
clock.advance(WATCH_TIMEOUT);
// First batch is done; instance 2 has started updating.
actions.putAll(0, INSTANCE_UPDATED)
.putAll(1, INSTANCE_UPDATED)
.putAll(2, INSTANCE_UPDATING);
assertState(ROLLING_FORWARD, actions.build());
clock.advance(WATCH_TIMEOUT);
// Rollback is requested by the caller rather than triggered by a failure.
updater.rollback(UPDATE_ID, AUDIT);
actions.putAll(1, INSTANCE_ROLLING_BACK);
actions.putAll(2, INSTANCE_ROLLING_BACK);
changeState(JOB, 1, KILLED);
changeState(JOB, 2, KILLED);
clock.advance(WATCH_TIMEOUT);
assertState(ROLLING_BACK, actions.build());
clock.advance(WATCH_TIMEOUT);
changeState(JOB, 2, ASSIGNED, STARTING, RUNNING);
changeState(JOB, 1, ASSIGNED, STARTING, RUNNING);
clock.advance(WATCH_TIMEOUT);
actions.putAll(2, INSTANCE_ROLLED_BACK)
.putAll(1, INSTANCE_ROLLED_BACK);
// Instance 0 is rolled back last, after instances 1 and 2 have completed.
changeState(JOB, 0, KILLED);
actions.putAll(0, INSTANCE_ROLLING_BACK);
clock.advance(WATCH_TIMEOUT);
assertState(ROLLING_BACK, actions.build());
changeState(JOB, 0, ASSIGNED, STARTING, RUNNING);
actions.putAll(0, INSTANCE_ROLLED_BACK);
clock.advance(WATCH_TIMEOUT);
assertState(ROLLED_BACK, actions.build());
assertJobState(
JOB,
ImmutableMap.of(0, OLD_CONFIG, 1, OLD_CONFIG, 2, OLD_CONFIG));
}
/**
 * Verifies rolling back an update that is blocked waiting for a pulse: with
 * {@code blockIfNoPulsesAfterMs} set, the started update is expected to sit in
 * {@code ROLL_FORWARD_AWAITING_PULSE} with no instance actions, and a rollback request is
 * expected to end in {@code ROLLED_BACK}, still with no instance actions.
 */
@Test
public void testRollbackCoordinatedUpdate() throws Exception {
// No kill expectation is registered for this scenario.
control.replay();
JobUpdate builder = makeJobUpdate(
// No-op - task is already matching the new config.
makeInstanceConfig(0, 0, NEW_CONFIG),
// Tasks needing update.
makeInstanceConfig(1, 2, OLD_CONFIG)).newBuilder();
builder.getInstructions().getSettings().setBlockIfNoPulsesAfterMs((int) PULSE_TIMEOUT_MS);
insertInitialTasks(IJobUpdate.build(builder));
changeState(JOB, 0, ASSIGNED, STARTING, RUNNING);
changeState(JOB, 1, ASSIGNED, STARTING, RUNNING);
changeState(JOB, 2, ASSIGNED, STARTING, RUNNING);
clock.advance(WATCH_TIMEOUT);
// Stays empty for the whole test: no instance action is expected at any point.
ImmutableMultimap.Builder<Integer, JobUpdateAction> actions = ImmutableMultimap.builder();
updater.start(IJobUpdate.build(builder), AUDIT);
// The update is blocked initially waiting for a pulse.
assertState(ROLL_FORWARD_AWAITING_PULSE, actions.build());
updater.rollback(UPDATE_ID, AUDIT);
clock.advance(WATCH_TIMEOUT);
assertState(ROLLED_BACK, actions.build());
}
/**
 * Verifies rolling back an update that was paused while rolling forward: instance 1 begins
 * updating, the update is paused ({@code ROLL_FORWARD_PAUSED}), and a rollback request is
 * expected to move it through {@code ROLLING_BACK} to {@code ROLLED_BACK}, leaving instance 0
 * on {@code NEW_CONFIG} and instances 1 and 2 on {@code OLD_CONFIG}.
 */
@Test
public void testRollbackPausedForwardUpdate() throws Exception {
expectTaskKilled().times(2);
control.replay();
JobUpdate builder = makeJobUpdate(
// No-op - task is already matching the new config.
makeInstanceConfig(0, 0, NEW_CONFIG),
// Tasks needing update.
makeInstanceConfig(1, 2, OLD_CONFIG)).newBuilder();
insertInitialTasks(IJobUpdate.build(builder));
changeState(JOB, 0, ASSIGNED, STARTING, RUNNING);
changeState(JOB, 1, ASSIGNED, STARTING, RUNNING);
changeState(JOB, 2, ASSIGNED, STARTING, RUNNING);
clock.advance(WATCH_TIMEOUT);
ImmutableMultimap.Builder<Integer, JobUpdateAction> actions = ImmutableMultimap.builder();
updater.start(IJobUpdate.build(builder), AUDIT);
actions.putAll(1, INSTANCE_UPDATING);
assertState(ROLLING_FORWARD, actions.build());
clock.advance(WATCH_TIMEOUT);
// Instance 1 restarts, but the update is paused before the clock is advanced again.
changeState(JOB, 1, KILLED, ASSIGNED, STARTING, RUNNING);
updater.pause(UPDATE_ID, AUDIT);
assertState(ROLL_FORWARD_PAUSED, actions.build());
// Rollback is requested directly from the paused state.
updater.rollback(UPDATE_ID, AUDIT);
actions.putAll(1, INSTANCE_ROLLING_BACK);
clock.advance(WATCH_TIMEOUT);
assertState(ROLLING_BACK, actions.build());
actions.putAll(1, INSTANCE_ROLLED_BACK);
changeState(JOB, 1, KILLED, ASSIGNED, STARTING, RUNNING);
clock.advance(WATCH_TIMEOUT);
assertState(ROLLED_BACK, actions.build());
assertJobState(
JOB,
ImmutableMap.of(0, NEW_CONFIG, 1, OLD_CONFIG, 2, OLD_CONFIG));
}
/**
 * Verifies that starting an update while another one is in progress fails with an
 * {@code UpdateInProgressException} that carries the in-progress update's summary, and that
 * the failed attempt leaves the lock manager holding only the original lock.
 */
@Test
public void testInProgressUpdate() throws Exception {
control.replay();
IJobUpdate inProgress = makeJobUpdate();
ILock lock = insertInProgressUpdate(inProgress);
// makeJobUpdate() always uses UPDATE_ID, so both updates are built from the same summary key.
IJobUpdate anotherUpdate = makeJobUpdate();
try {
updater.start(anotherUpdate, AUDIT);
fail("update cannot start when another is in-progress");
} catch (UpdateInProgressException e) {
// Expected.
// The reported summary should equal the original one with a ROLLING_FORWARD state attached.
assertEquals(
inProgress.getSummary().newBuilder().setState(new JobUpdateState(ROLLING_FORWARD, 0, 0)),
e.getInProgressUpdateSummary().newBuilder());
// The rejected start must not have added or removed any lock.
assertEquals(ImmutableList.of(lock), ImmutableList.copyOf(lockManager.getLocks()));
} finally {
// Always release the lock taken by insertInProgressUpdate, even if an assertion fails.
lockManager.releaseLock(lock);
}
}
/**
 * Builds an immutable update summary for {@code key}, owned by the fixed user {@code "user"}.
 *
 * @param key Update key to embed in the summary.
 * @return Summary wrapping a copy of {@code key}.
 */
private static IJobUpdateSummary makeUpdateSummary(IJobUpdateKey key) {
  JobUpdateSummary summary = new JobUpdateSummary();
  summary.setUser("user");
  summary.setKey(key.newBuilder());
  return IJobUpdateSummary.build(summary);
}
/**
 * Builds the baseline update used throughout these tests. The desired state is
 * {@code NEW_CONFIG} on instances 0-2; settings are group size 1, rollback on failure enabled,
 * minimum running time equal to {@code WATCH_TIMEOUT}, and an empty instance filter.
 *
 * @param configs Initial-state entries to add, in order; may be empty.
 * @return Immutable update keyed by {@code UPDATE_ID}.
 */
private static IJobUpdate makeJobUpdate(IInstanceTaskConfig... configs) {
  JobUpdateSummary summary = makeUpdateSummary(UPDATE_ID).newBuilder().setMetadata(METADATA);

  InstanceTaskConfig desiredState = new InstanceTaskConfig()
      .setTask(NEW_CONFIG.newBuilder())
      .setInstances(ImmutableSet.of(new Range(0, 2)));
  JobUpdateSettings settings = new JobUpdateSettings()
      .setUpdateGroupSize(1)
      .setRollbackOnFailure(true)
      .setMinWaitInInstanceRunningMs(WATCH_TIMEOUT.as(Time.MILLISECONDS).intValue())
      .setUpdateOnlyTheseInstances(ImmutableSet.of());
  JobUpdateInstructions instructions = new JobUpdateInstructions()
      .setDesiredState(desiredState)
      .setSettings(settings);

  // Initial state is appended one entry at a time so an empty varargs leaves it unset.
  for (IInstanceTaskConfig initial : configs) {
    instructions.addToInitialState(initial.newBuilder());
  }

  return IJobUpdate.build(new JobUpdate().setSummary(summary).setInstructions(instructions));
}
/**
 * Returns a copy of {@code update} whose desired state covers the single instance range
 * {@code [0, instanceCount - 1]}.
 *
 * @param update Update to copy.
 * @param instanceCount Desired number of instances; the range's upper bound is this minus one.
 * @return New immutable update with the adjusted desired instance range.
 */
private static IJobUpdate setInstanceCount(IJobUpdate update, int instanceCount) {
  JobUpdate mutableCopy = update.newBuilder();
  InstanceTaskConfig desiredState = mutableCopy.getInstructions().getDesiredState();
  desiredState.setInstances(ImmutableSet.of(new Range(0, instanceCount - 1)));
  return IJobUpdate.build(mutableCopy);
}
/**
 * Builds an immutable instance-to-config mapping for one contiguous instance range.
 *
 * @param start First instance id passed to the range.
 * @param end Last instance id passed to the range.
 * @param config Task configuration assigned to those instances.
 * @return Immutable mapping of the range {@code (start, end)} to a copy of {@code config}.
 */
private static IInstanceTaskConfig makeInstanceConfig(int start, int end, ITaskConfig config) {
  InstanceTaskConfig mapping = new InstanceTaskConfig();
  mapping.setInstances(ImmutableSet.of(new Range(start, end)));
  mapping.setTask(config.newBuilder());
  return IInstanceTaskConfig.build(mapping);
}
}