/*******************************************************************************
* Copyright 2013 Michael Marconi
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing permissions and limitations under
* the License.
******************************************************************************/
package oncue.scheduler;
import static java.lang.String.format;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import oncue.backingstore.BackingStore;
import oncue.common.comparators.JobComparator;
import oncue.common.events.AgentStartedEvent;
import oncue.common.events.AgentStoppedEvent;
import oncue.common.events.JobCleanupEvent;
import oncue.common.events.JobEnqueuedEvent;
import oncue.common.events.JobFailedEvent;
import oncue.common.events.JobProgressEvent;
import oncue.common.exceptions.DeleteJobException;
import oncue.common.messages.AbstractWorkRequest;
import oncue.common.messages.Agent;
import oncue.common.messages.AgentSummary;
import oncue.common.messages.CleanupJobs;
import oncue.common.messages.DeleteJob;
import oncue.common.messages.EnqueueJob;
import oncue.common.messages.Job;
import oncue.common.messages.Job.State;
import oncue.common.messages.JobFailed;
import oncue.common.messages.JobProgress;
import oncue.common.messages.JobSummary;
import oncue.common.messages.RerunJob;
import oncue.common.messages.SimpleMessages.SimpleMessage;
import oncue.common.messages.WorkAvailable;
import oncue.common.messages.WorkResponse;
import oncue.common.settings.Settings;
import oncue.common.settings.SettingsProvider;
import oncue.scheduler.exceptions.JobNotFoundException;
import oncue.scheduler.exceptions.ScheduleException;
import scala.concurrent.duration.Deadline;
import akka.actor.ActorInitializationException;
import akka.actor.ActorRef;
import akka.actor.ActorSystem;
import akka.actor.Cancellable;
import akka.actor.Status.Failure;
import akka.actor.Status.Success;
import akka.actor.UntypedActor;
import akka.event.Logging;
import akka.event.LoggingAdapter;
import akka.remote.RemoteClientShutdown;
/**
* A scheduler is responsible for keeping a list of registered agents, broadcasting new work to them
* when it arrives and distributing the work using a variety of scheduling algorithms, depending on
* the concrete implementation.
*/
public abstract class AbstractScheduler<WorkRequest extends AbstractWorkRequest> extends
UntypedActor {
// A periodic check for dead agents
private Cancellable agentMonitor;
// Map an agent to a deadline for deregistration
private Map<String, Deadline> agents = new HashMap<>();
// Map an agent to a the set of worker types it can process
private Map<String, Set<String>> agentWorkers = new HashMap<>();
// The persistent backing store
protected BackingStore backingStore;
// A scheduled check for jobs to broadcast
private Cancellable jobsBroadcast;
protected LoggingAdapter log = Logging.getLogger(getContext().system(), this);
// A flag to indicate that jobs should not be scheduled temporarily
private boolean paused = false;
// The map of scheduled jobs
private ScheduledJobs scheduledJobs;
protected Settings settings = SettingsProvider.SettingsProvider.get(getContext().system());
// A probe for testing
private ActorRef testProbe;
// The queue of unscheduled jobs
protected UnscheduledJobs unscheduledJobs;
public List<Job> getScheduledJobs() {
return scheduledJobs.getScheduledJobs();
}
public int getUnscheduledJobsCount() {
return unscheduledJobs.getSize();
}
/**
* @param backingStore is an implementation of {@linkplain BackingStore}
*/
public AbstractScheduler(Class<? extends BackingStore> backingStore) {
if (backingStore == null)
throw new RuntimeException("A backing store implementation must be specified!");
try {
this.backingStore = backingStore.getConstructor(ActorSystem.class, Settings.class)
.newInstance(getContext().system(), settings);
unscheduledJobs = new UnscheduledJobs(this.backingStore, log, getComparator());
scheduledJobs = new ScheduledJobs(this.backingStore);
log.info("{} is running, backed by {}", getClass().getSimpleName(),
backingStore.getSimpleName());
} catch (InstantiationException | IllegalAccessException | IllegalArgumentException
| InvocationTargetException | NoSuchMethodException | SecurityException e) {
throw new ActorInitializationException(getSelf(),
"Failed to create a backing store from class: " + backingStore.getName(), e);
}
}
protected Comparator<Job> getComparator() {
return new JobComparator();
}
/**
* While there are jobs in the queue, continue sending a "Work available" message to all
* registered agents.
*/
private void broadcastJobs() {
/*
* Don't broadcast jobs if there are no agents, no more jobs on the unscheduled queue or
* scheduling is paused
*/
if (agents.isEmpty() || unscheduledJobs.isEmpty() || paused)
return;
log.debug("Broadcasting jobs");
for (String agent : agents.keySet()) {
if (testProbe != null)
testProbe.tell(createWorkAvailable(), getSelf());
getContext().actorFor(agent).tell(createWorkAvailable(), getSelf());
}
// Tee-up another broadcast if necessary
if (!unscheduledJobs.isEmpty()) {
// Cancel any scheduled broadcast
if (jobsBroadcast != null)
jobsBroadcast.cancel();
jobsBroadcast = getContext().system().scheduler()
.scheduleOnce(settings.SCHEDULER_BROADCAST_JOBS_FREQUENCY, new Runnable() {
@Override
public void run() {
getSelf().tell(SimpleMessage.BROADCAST_JOBS, getSelf());
}
}, getContext().dispatcher());
}
}
/**
* Check to see that each agent has sent a heart beat by the deadline.
*/
private void checkAgents() {
for (String agent : agents.keySet()) {
Deadline deadline = agents.get(agent);
if (deadline.isOverdue()) {
log.error("Found a dead agent: '{}'", agent);
if (testProbe != null)
testProbe.tell(SimpleMessage.AGENT_DEAD, getSelf());
deregisterAgent(agent);
rebroadcastJobs(agent);
}
}
}
/**
* When a job is finished or has failed, it must be removed from the scheduler's records.
*
* @param job is the {@linkplain Job} to clean up after
*/
private void cleanupJob(Job job, String agent) {
log.debug("Cleaning up {} for agent {}", job, agent);
scheduledJobs.removeJobById(job.getId(), agent);
}
/**
* Construct a message to advertise the type of work available.
*/
private WorkAvailable createWorkAvailable() {
return new WorkAvailable(unscheduledJobs.getWorkerTypes());
}
/**
* Delete an existing job
*
* @param job is the job to delete
* @return the deleted job
* @throws DeleteJobException if the job is currently running
*/
private Job deleteJob(Job job) throws DeleteJobException {
switch (job.getState()) {
case RUNNING:
throw new DeleteJobException("This job cannot be deleted as it is currently running");
case QUEUED:
boolean removed = unscheduledJobs.removeJobById(job.getId());
if (!removed)
throw new DeleteJobException(
"Failed to remove the job from the unscheduled jobs queue");
break;
case COMPLETE:
backingStore.removeCompletedJobById(job.getId());
break;
case FAILED:
backingStore.removeFailedJobById(job.getId());
break;
default:
throw new DeleteJobException(job.getState().toString()
+ " is an unrecognised job state");
}
job.setState(State.DELETED);
return job;
}
/**
* De-register an agent
*/
private void deregisterAgent(String url) {
agents.remove(url);
agentWorkers.remove(url);
// Stop listening to remote events
getContext().system().eventStream().unsubscribe(getContext().actorFor(url));
// Broadcast agent stopped event
Agent agent = new Agent(url);
getContext().system().eventStream().publish(new AgentStoppedEvent(agent));
}
/**
* Dispatch jobs to agents according to entries in the schedule. This method will also keep
* record of the jobs scheduled to each agent, in case an agent dies.
*
* @param schedule is the {@linkplain Schedule} that maps agents to jobs
*/
protected void dispatchJobs(Schedule schedule) {
validateSchedule(schedule);
for (Map.Entry<String, WorkResponse> entry : schedule.getEntries()) {
ActorRef agent = getContext().actorFor(entry.getKey());
WorkResponse workResponse = entry.getValue();
// Assign the jobs to the agent
unscheduledJobs.removeJobs(workResponse.getJobs());
scheduledJobs.addJobs(agent.path().toString(), workResponse.getJobs());
log.debug("Sending work response with {} jobs to agent {}", workResponse.getJobs()
.size(), agent.toString());
// Tell the agent about the work
agent.tell(workResponse, getSelf());
}
}
/**
* Enqueue a new job
*/
private Job enqueueJob(EnqueueJob enqueueJob) {
Job job = new Job(backingStore.getNextJobID(), enqueueJob.getWorkerType());
Map<String, String> jobParams = enqueueJob.getParams();
if (jobParams != null) {
job.getParams().putAll(jobParams);
}
augmentJob(job);
unscheduledJobs.addJob(job);
getContext().system().eventStream().publish(new JobEnqueuedEvent(job));
startJobsBroadcast();
return job;
}
/**
* This method can be overridden by a {@linkplain AbstractScheduler} implementation in order to
* modify the Job object before it is persisted in the list of unscheduled jobs. This allows
* scheduler-implementation-specific metadata to be attached to the job.
*
* @param job The job to modify
*/
protected void augmentJob(Job job) {
// NOOP
};
/**
* Look through all jobs to find an existing job
*
* @param id is the unique job identifier
* @return the matching job
* @throws JobNotFoundException
*/
private Job findExistingJob(long id) throws JobNotFoundException {
Set<Job> jobs = getAllJobs();
for (Job job : jobs) {
if (job.getId() == id)
return job;
}
throw new JobNotFoundException("Failed to find an existing job with ID " + id);
}
/**
* @return the set of all registered agents
*/
protected Set<String> getAgents() {
return agents.keySet();
}
/**
* @return the map of agents to the worker types they can process
*/
protected Map<String, Set<String>> getAgentWorkers() {
return agentWorkers;
}
/**
* @return the full set of unscheduled, scheduled, complete and failed jobs
*/
private Set<Job> getAllJobs() {
Set<Job> jobs = new HashSet<>();
for (Iterator<Job> iterator = unscheduledJobs.iterator(); iterator.hasNext();) {
jobs.add(iterator.next());
}
jobs.addAll(scheduledJobs.getJobs());
jobs.addAll(backingStore.getCompletedJobs());
jobs.addAll(backingStore.getFailedJobs());
return jobs;
}
/**
* Record the details of a failed job
*
* @param jobFailed contains both the failed job and the cause of failure
*/
private void handleJobFailure(Job job, String agent) {
if (backingStore != null)
backingStore.persistJobFailure(job);
cleanupJob(job, agent);
getContext().system().eventStream().publish(new JobFailedEvent(job));
}
/**
* Record any progress made against a job. If the job is complete, remove it from the jobs
* scheduled against an agent.
*
* @param jobProgress describes the job and associated progress.
*/
private void handleJobProgress(Job job, String agent) {
if (backingStore != null)
backingStore.persistJobProgress(job);
if (job.getProgress() == 1.0) {
log.debug("{} is complete.", job);
cleanupJob(job, agent);
} else if (job.getState() != State.QUEUED)
scheduledJobs.updateJob(job, agent);
getContext().system().eventStream().publish(new JobProgressEvent(job));
}
/**
* Inject a probe into this actor for testing
*
* @param testProbe is a JavaTestKit probe
*/
public void injectProbe(ActorRef testProbe) {
this.testProbe = testProbe;
}
/**
* Set up a monitor that periodically checks for dead Agents
*/
private void monitorAgents() {
agentMonitor = getContext()
.system()
.scheduler()
.schedule(settings.SCHEDULER_MONITOR_AGENTS_FREQUENCY,
settings.SCHEDULER_MONITOR_AGENTS_FREQUENCY, new Runnable() {
@Override
public void run() {
getSelf().tell(SimpleMessage.CHECK_AGENTS, getSelf());
}
}, getContext().dispatcher());
}
@SuppressWarnings("unchecked")
@Override
public void onReceive(Object message) throws Exception {
if (testProbe != null)
testProbe.forward(message, getContext());
if (message.equals(SimpleMessage.AGENT_HEARTBEAT)) {
log.debug("Got a heartbeat from agent: '{}'", getSender());
registerAgent(getSender().path().toString());
}
else if (message instanceof RemoteClientShutdown) {
String system = ((RemoteClientShutdown) message).getRemoteAddress().system();
if ("oncue-agent".equals(system)) {
String agent = ((RemoteClientShutdown) message).getRemoteAddress().toString()
+ settings.AGENT_PATH;
log.info("Agent '{}' has shut down", agent);
deregisterAgent(agent);
rebroadcastJobs(agent);
if (testProbe != null)
testProbe.tell(SimpleMessage.AGENT_SHUTDOWN, getSelf());
}
}
else if (message.equals(SimpleMessage.CHECK_AGENTS)) {
log.debug("Checking for dead agents...");
checkAgents();
}
else if (message instanceof EnqueueJob) {
log.debug("Got a new job to enqueue: {}", message);
Job job = enqueueJob((EnqueueJob) message);
getSender().tell(job, getSelf());
}
else if (message instanceof RerunJob) {
log.debug("Got an existing job to re-run: {}", message);
Job job = findExistingJob(((RerunJob) message).getId());
Job rerunJob = rerunJob(job);
getSender().tell(rerunJob, getSelf());
}
else if (message instanceof DeleteJob) {
log.debug("Got an existing job to delete: {}", message);
Job job = findExistingJob(((DeleteJob) message).getId());
Job deleteJob;
try {
deleteJob = deleteJob(job);
getSender().tell(deleteJob, getSelf());
} catch (DeleteJobException e) {
log.error(e, "Failed to delete job {}", job.getId());
getSender().tell(new Failure(e), getSelf());
}
}
else if (message instanceof CleanupJobs) {
log.debug("Clean up jobs");
CleanupJobs cleanupJobs = (CleanupJobs) message;
int numCleanedJobs = backingStore.cleanupJobs(cleanupJobs.isIncludeFailedJobs(),
cleanupJobs.getExpirationAge());
getContext().system().eventStream().publish(new JobCleanupEvent());
getSender().tell(new Success(format("Removed %d jobs", numCleanedJobs)), getSelf());
}
else if (message instanceof AbstractWorkRequest) {
log.debug("Got a work request from agent '{}': {}", getSender().path().toString(),
message);
AbstractWorkRequest workRequest = (AbstractWorkRequest) message;
agentWorkers.put(getSender().path().toString(), workRequest.getWorkerTypes());
boolean workAvailable = unscheduledJobs.isWorkAvailable(workRequest.getWorkerTypes());
if (!workAvailable || paused)
replyWithNoWork(getSender());
else {
scheduleJobs((WorkRequest) workRequest);
}
}
else if (message instanceof JobProgress) {
Job job = ((JobProgress) message).getJob();
log.debug("Agent reported progress of {} on {}", job.getProgress(), job);
handleJobProgress(job, getSender().path().toString());
}
else if (message instanceof JobFailed) {
Job job = ((JobFailed) message).getJob();
log.debug("Agent reported a failed job {} ({})", job, job.getErrorMessage());
handleJobFailure(job, getSender().path().toString());
}
else if (message == SimpleMessage.JOB_SUMMARY) {
log.debug("Received a request for a job summary from {}", getSender());
replyWithJobSummary();
}
else if (message == SimpleMessage.LIST_AGENTS) {
log.debug("Received a request for a the list of registered agents from {}", getSender());
replyWithAgentSummary();
}
else if (message.equals(SimpleMessage.BROADCAST_JOBS)) {
log.debug("Teeing up a job broadcast...");
broadcastJobs();
}
else {
log.error("Unrecognised message: {}", message);
unhandled(message);
}
}
/**
* Pause job scheduling temporarily
*/
public void pause() {
paused = true;
}
@Override
public void postStop() {
super.postStop();
if (agentMonitor != null)
agentMonitor.cancel();
if (jobsBroadcast != null)
jobsBroadcast.cancel();
log.info("Shut down.");
}
@Override
public void preStart() {
monitorAgents();
super.preStart();
}
/**
* In the case where an Agent has died or shutdown before completing the jobs assigned to it, we
* need to re-broadcast the jobs so they are run by another agent.
*
* @param agent is the Agent to check for incomplete jobs
*/
private void rebroadcastJobs(String agent) {
if (!scheduledJobs.getJobs(agent).isEmpty()) {
// Grab the list of jobs scheduled for this agent
List<Job> agentJobs = new ArrayList<>();
for (Job scheduledJob : scheduledJobs.getJobs(agent)) {
agentJobs.add(scheduledJob);
}
for (Job job : agentJobs) {
// Remove job from the agent
scheduledJobs.removeJobById(job.getId(), agent);
// Reset job state and progress
job.setState(State.QUEUED);
job.setProgress(0);
handleJobProgress(job, agent);
// Add jobs back onto the unscheduled queue
unscheduledJobs.addJob(job);
}
}
broadcastJobs();
}
/**
* Register the heartbeat of an agent, capturing the heartbeat time as a timestamp. If this is a
* new Agent, return a message indicating that it has been registered.
*
* @param agent is the agent to register
*/
private void registerAgent(String url) {
if (!agents.containsKey(url)) {
Agent agent = new Agent(url);
getContext().actorFor(url).tell(SimpleMessage.AGENT_REGISTERED, getSelf());
getContext().system().eventStream().subscribe(getSelf(), RemoteClientShutdown.class);
getContext().system().eventStream().publish(new AgentStartedEvent(agent));
log.info("Registered agent: {}", url);
}
agents.put(url, settings.SCHEDULER_AGENT_HEARTBEAT_TIMEOUT.fromNow());
}
/**
* Reply with the list of registered agents
*/
private void replyWithAgentSummary() {
List<Agent> agents = new ArrayList<>();
for (String url : this.agents.keySet()) {
Agent agent = new oncue.common.messages.Agent(url);
agents.add(agent);
}
getSender().tell(new AgentSummary(agents), getSelf());
}
/**
* Construct and reply with a job summary message
*/
private void replyWithJobSummary() {
getSender().tell(new JobSummary(getAllJobs()), getSelf());
}
/**
* Send a response to the requesting agent containing a {@linkplain WorkResponse} with no jobs.
*/
private void replyWithNoWork(ActorRef agent) {
agent.tell(new WorkResponse(), getSelf());
}
/**
* Re-run an existing job
*
* @param job is the job to re-run
*/
private Job rerunJob(Job job) {
Job rerunJob = new Job(job.getId(), job.getWorkerType());
rerunJob.setParams(job.getParams());
rerunJob.setRerun(true);
// TODO Find a way to make this transactional
unscheduledJobs.addJob(rerunJob);
if (job.getState() == Job.State.COMPLETE)
backingStore.removeCompletedJobById(job.getId());
else if (job.getState() == Job.State.FAILED)
backingStore.removeFailedJobById(job.getId());
getContext().system().eventStream().publish(new JobEnqueuedEvent(job));
startJobsBroadcast();
return rerunJob;
}
/**
* Create a schedule that maps agents to work responses. Once the schedule has been created, the
* work should be dispatched by calling the <i>dispatchJobs</i> method.
*/
protected abstract void scheduleJobs(WorkRequest workRequest);
/**
* Schedule a jobs broadcast. Cancel any previously scheduled broadcast, to ensure quiescence in
* the case where lots of new jobs arrive in a short time.
*/
private void startJobsBroadcast() {
if (jobsBroadcast != null && !jobsBroadcast.isCancelled())
jobsBroadcast.cancel();
jobsBroadcast = getContext().system().scheduler()
.scheduleOnce(settings.SCHEDULER_BROADCAST_JOBS_QUIESCENCE_PERIOD, new Runnable() {
@Override
public void run() {
getSelf().tell(SimpleMessage.BROADCAST_JOBS, getSelf());
}
}, getContext().dispatcher());
}
/**
* Allow the scheduler to continue scheduling jobs.
*/
public void unpause() {
paused = false;
}
/**
* Ensure that the schedule produced by the scheduler is valid, e.g. ensure that no agent is
* scheduled a job it does not have the worker to process.
*
* @param schedule is the {@linkplain Schedule} to validate
*/
private void validateSchedule(Schedule schedule) {
for (Map.Entry<String, WorkResponse> entry : schedule.getEntries()) {
String agent = entry.getKey();
WorkResponse workResponse = entry.getValue();
for (Job job : workResponse.getJobs()) {
boolean foundWorkerType = false;
for (String workerType : agentWorkers.get(agent)) {
if (job.getWorkerType().equals(workerType)) {
foundWorkerType = true;
break;
}
}
if (!foundWorkerType)
throw new ScheduleException("Agent " + agent + " was assigned "
+ job.toString()
+ ", but does not have a worker capable of processing it!");
}
}
}
}