/**
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.aurora.scheduler.cron.quartz;
import java.lang.annotation.Retention;
import java.lang.annotation.Target;
import java.util.Date;
import java.util.Set;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.atomic.AtomicLong;
import javax.inject.Inject;
import javax.inject.Qualifier;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Optional;
import com.google.common.collect.Iterables;
import com.google.common.collect.Sets;
import org.apache.aurora.common.stats.Stats;
import org.apache.aurora.common.stats.StatsProvider;
import org.apache.aurora.common.util.BackoffHelper;
import org.apache.aurora.gen.CronCollisionPolicy;
import org.apache.aurora.scheduler.BatchWorker;
import org.apache.aurora.scheduler.BatchWorker.NoResult;
import org.apache.aurora.scheduler.base.JobKeys;
import org.apache.aurora.scheduler.base.Query;
import org.apache.aurora.scheduler.base.Tasks;
import org.apache.aurora.scheduler.configuration.SanitizedConfiguration;
import org.apache.aurora.scheduler.cron.CronException;
import org.apache.aurora.scheduler.cron.SanitizedCronJob;
import org.apache.aurora.scheduler.events.PubsubEvent.EventSubscriber;
import org.apache.aurora.scheduler.state.StateManager;
import org.apache.aurora.scheduler.storage.Storage;
import org.apache.aurora.scheduler.storage.entities.IJobConfiguration;
import org.apache.aurora.scheduler.storage.entities.IJobKey;
import org.apache.aurora.scheduler.storage.entities.ITaskConfig;
import org.quartz.DisallowConcurrentExecution;
import org.quartz.Job;
import org.quartz.JobExecutionContext;
import org.quartz.JobExecutionException;
import org.quartz.PersistJobDataAfterExecution;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import static java.lang.annotation.ElementType.FIELD;
import static java.lang.annotation.ElementType.METHOD;
import static java.lang.annotation.ElementType.PARAMETER;
import static java.lang.annotation.RetentionPolicy.RUNTIME;
import static java.util.Objects.requireNonNull;
import static com.google.common.base.Preconditions.checkState;
import static org.apache.aurora.gen.ScheduleStatus.KILLING;
/**
* Encapsulates the logic behind a single trigger of a single job key. Multiple executions may run
* concurrently but only a single instance will be active at a time per job key.
*
* <p>
* Executions may block for long periods of time when waiting for a kill to complete. The Quartz
* scheduler should therefore be configured with a large number of threads.
*/
@DisallowConcurrentExecution
@PersistJobDataAfterExecution
class AuroraCronJob implements Job, EventSubscriber {
private static final Logger LOG = LoggerFactory.getLogger(AuroraCronJob.class);
private static final AtomicLong CRON_JOB_TRIGGERS = Stats.exportLong("cron_job_triggers");
private static final AtomicLong CRON_JOB_MISFIRES = Stats.exportLong("cron_job_misfires");
private static final AtomicLong CRON_JOB_PARSE_FAILURES =
Stats.exportLong("cron_job_parse_failures");
private static final AtomicLong CRON_JOB_COLLISIONS = Stats.exportLong("cron_job_collisions");
private static final AtomicLong CRON_JOB_CONCURRENT_RUNS =
Stats.exportLong("cron_job_concurrent_runs");
@VisibleForTesting
static final Optional<String> KILL_AUDIT_MESSAGE = Optional.of("Killed by cronScheduler");
private final StateManager stateManager;
private final BackoffHelper delayedStartBackoff;
private final BatchWorker<NoResult> batchWorker;
private final Set<IJobKey> killFollowups = Sets.newConcurrentHashSet();
/**
* Annotation for the max cron batch size.
*/
@VisibleForTesting
@Qualifier
@Target({ FIELD, PARAMETER, METHOD }) @Retention(RUNTIME)
@interface CronMaxBatchSize { }
static class CronBatchWorker extends BatchWorker<NoResult> {
@Inject
CronBatchWorker(
Storage storage,
StatsProvider statsProvider,
@CronMaxBatchSize int maxBatchSize) {
super(storage, statsProvider, maxBatchSize);
}
@Override
protected String serviceName() {
return "CronBatchWorker";
}
}
@Inject
AuroraCronJob(
Config config,
StateManager stateManager,
CronBatchWorker batchWorker) {
this.stateManager = requireNonNull(stateManager);
this.batchWorker = requireNonNull(batchWorker);
this.delayedStartBackoff = requireNonNull(config.getDelayedStartBackoff());
}
@Override
public void execute(JobExecutionContext context) throws JobExecutionException {
// We assume quartz prevents concurrent runs of this job for a given job key. This allows us
// to avoid races where we might kill another run's tasks.
checkState(context.getJobDetail().isConcurrentExectionDisallowed());
doExecute(context);
}
@VisibleForTesting
void doExecute(JobExecutionContext context) throws JobExecutionException {
final IJobKey key = Quartz.auroraJobKey(context.getJobDetail().getKey());
final String path = JobKeys.canonicalString(key);
// Prevent a concurrent run for this job in case a previous trigger took longer to run.
// This approach relies on saving the "work in progress" token within the job context itself
// (see below) and relying on killFollowups to signal "work completion".
if (context.getJobDetail().getJobDataMap().containsKey(path)) {
CRON_JOB_CONCURRENT_RUNS.incrementAndGet();
if (killFollowups.contains(key)) {
context.getJobDetail().getJobDataMap().remove(path);
killFollowups.remove(key);
LOG.info("Resetting job context for cron {}", path);
} else {
LOG.info("Ignoring trigger as another concurrent run is active for cron {}", path);
return;
}
}
CompletableFuture<NoResult> scheduleResult = batchWorker.<NoResult>execute(storeProvider -> {
Optional<IJobConfiguration> config = storeProvider.getCronJobStore().fetchJob(key);
if (!config.isPresent()) {
LOG.warn("Cron was triggered for {} but no job with that key was found in storage.", path);
CRON_JOB_MISFIRES.incrementAndGet();
return BatchWorker.NO_RESULT;
}
SanitizedCronJob cronJob;
try {
cronJob = SanitizedCronJob.from(new SanitizedConfiguration(config.get()));
} catch (CronException e) {
LOG.warn("Invalid cron job for {} in storage - failed to parse", key, e);
CRON_JOB_PARSE_FAILURES.incrementAndGet();
return BatchWorker.NO_RESULT;
}
CronCollisionPolicy collisionPolicy = cronJob.getCronCollisionPolicy();
LOG.info("Cron triggered for {} at {} with policy {}", path, new Date(), collisionPolicy);
CRON_JOB_TRIGGERS.incrementAndGet();
final Query.Builder activeQuery = Query.jobScoped(key).active();
Set<String> activeTasks = Tasks.ids(storeProvider.getTaskStore().fetchTasks(activeQuery));
ITaskConfig task = cronJob.getSanitizedConfig().getJobConfig().getTaskConfig();
Set<Integer> instanceIds = cronJob.getSanitizedConfig().getInstanceIds();
if (activeTasks.isEmpty()) {
stateManager.insertPendingTasks(storeProvider, task, instanceIds);
return BatchWorker.NO_RESULT;
}
CRON_JOB_COLLISIONS.incrementAndGet();
switch (collisionPolicy) {
case KILL_EXISTING:
for (String taskId : activeTasks) {
stateManager.changeState(
storeProvider,
taskId,
Optional.absent(),
KILLING,
KILL_AUDIT_MESSAGE);
}
LOG.info("Waiting for job to terminate before launching cron job " + path);
// Use job detail map to signal a "work in progress" condition to subsequent triggers.
context.getJobDetail().getJobDataMap().put(path, null);
batchWorker.executeWithReplay(
delayedStartBackoff.getBackoffStrategy(),
store -> {
Query.Builder query = Query.taskScoped(activeTasks).active();
if (Iterables.isEmpty(storeProvider.getTaskStore().fetchTasks(query))) {
LOG.info("Initiating delayed launch of cron " + path);
stateManager.insertPendingTasks(store, task, instanceIds);
return new BatchWorker.Result<>(true, null);
} else {
LOG.info("Not yet safe to run cron " + path);
return new BatchWorker.Result<>(false, null);
}
})
.thenAccept(ignored -> {
killFollowups.add(key);
LOG.info("Finished delayed launch for cron " + path);
});
break;
case RUN_OVERLAP:
LOG.error("Ignoring trigger for job {} with deprecated collision"
+ "policy RUN_OVERLAP due to unterminated active tasks.", path);
break;
case CANCEL_NEW:
break;
default:
LOG.error("Unrecognized cron collision policy: " + collisionPolicy);
}
return BatchWorker.NO_RESULT;
});
try {
scheduleResult.get();
} catch (ExecutionException | InterruptedException e) {
LOG.warn("Interrupted while trying to launch cron " + path, e);
Thread.currentThread().interrupt();
throw new JobExecutionException(e);
}
}
static class Config {
private final BackoffHelper delayedStartBackoff;
Config(BackoffHelper delayedStartBackoff) {
this.delayedStartBackoff = requireNonNull(delayedStartBackoff);
}
public BackoffHelper getDelayedStartBackoff() {
return delayedStartBackoff;
}
}
}