package automately.core.services.job;
import automately.core.data.Job;
import automately.core.data.Meta;
import automately.core.data.User;
import automately.core.data.UserData;
import automately.core.data.comparators.JobComparator;
import automately.core.data.predicates.JsonQueryPredicate;
import automately.core.file.VirtualFile;
import automately.core.file.VirtualFileSystem;
import automately.core.services.core.AutomatelyService;
import automately.core.services.job.execution.factories.hello.HelloWorldContextFactory;
import automately.core.services.job.execution.factories.js.NativeJSContextFactory;
import automately.core.services.job.execution.factories.v8.V8ContextFactory;
import com.hazelcast.core.*;
import com.hazelcast.nio.Address;
import com.hazelcast.query.*;
import io.jsync.Async;
import io.jsync.Handler;
import io.jsync.app.core.Cluster;
import io.jsync.app.core.Config;
import io.jsync.app.core.Logger;
import io.jsync.buffer.Buffer;
import io.jsync.eventbus.EventBus;
import io.jsync.eventbus.Message;
import io.jsync.http.HttpClient;
import io.jsync.impl.Windows;
import io.jsync.json.JsonArray;
import io.jsync.json.JsonObject;
import io.jsync.json.impl.Base64;
import java.io.*;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import java.util.concurrent.*;
/**
* JobServer handles all jobs. This is used to submit jobs to the cluster
* for execution.
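*
* <p>Illustrative usage of the static helpers (assumes {@code user} and {@code jobToken}
* were obtained elsewhere):</p>
* <pre>{@code
* Job job = JobServer.getJob(user, jobToken);
* if (job != null && !JobServer.isStale(job)) {
*     JobServer.updateStatus(job, "queued");
* }
* }</pre>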
*/
public class JobServer extends AutomatelyService {
// TODO Definitely complete javadocs
private static boolean initialized = false;
private static Cluster cluster = null;
private static IMap<String, Job> jobs = null;
private static ISet<String> jobsBeingExecuted = null;
private static IMap<String, String> jobExecutionNodes = null;
private static IMap<String, JsonObject> registeredJobServers = null;
private static String defaultExecutionFactory = NativeJSContextFactory.class.getCanonicalName();
@Deprecated
public static void setScriptContextFactory(String factory) {
setDefaultExecutionFactory(factory);
}
@Deprecated
public static String getScriptContextFactory() {
return getDefaultExecutionFactory();
}
public static void setDefaultExecutionFactory(String factory) {
defaultExecutionFactory = factory;
}
public static String getDefaultExecutionFactory() {
return defaultExecutionFactory;
}
public static boolean initialized(){
return initialized;
}
private static void checkInitialized(){
if(!initialized){
throw new RuntimeException("The JobServer has not been initialized yet!");
}
}
/**
* isStale is used to check if an Automately Job is stale in the cluster. It returns
* false if the job is not stale. A job is stale when its status still claims it is
* queued or running but no node in the cluster is actually executing it.
*
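* <p>A minimal sketch of typical usage (assumes {@code job} came from
* {@link #getJob(User, String)}):</p>
* <pre>{@code
* if (JobServer.isStale(job)) {
*     // isStale has already marked the job complete with a "Stale Job" error
* }
* }</pre>
*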
* @param job the Job you wish to check
* @return returns true if the job is stale
*/
public static boolean isStale(Job job) {
checkInitialized();
if (job == null) {
throw new NullPointerException();
}
ICountDownLatch globalJobFinishLatch = cluster.hazelcast().getCountDownLatch(job.token() + "_job_finish_latch");
String status = job.status;
// Return false since the job has already been handled
if (status.equals("complete") || status.equals("halted") || status.equals("stopping") || status.equals("timeout")) {
return false;
}
final boolean[] isStale = {false};
// Newest method for checking for stale jobs:
// we check whether there is an execution lock on the job within the cluster.
// A job that is being executed holds this lock, so if the lock is held
// the job is not stale.
ILock executionLock = cluster.hazelcast().getLock("_job_lock_execution_" + job.token());
if(!executionLock.isLocked()){
isStale[0] = true;
}
// If the job is running, queued, processing we should definitely check if it is stale
if (status.equals("running") || status.equals("queued") || status.equals("processing")) {
// This means the job is being executed and a node is handling it.
// So we should check if the node still exists
if(jobsBeingExecuted.contains(job.token()) && jobExecutionNodes.containsKey(job.token())){
String nodeId = jobExecutionNodes.get(job.token());
// The executing server was removed from the cluster, so the job is stale
if(!registeredJobServers.containsKey(nodeId)){
isStale[0] = true;
}
} else if (!jobsBeingExecuted.contains(job.token())) {
isStale[0] = true;
}
}
if(isStale[0]){
job.status = "complete";
JsonObject error = new JsonObject();
error.putString("message", "The job has went stale. It is no longer being executed.");
error.putString("code", "Stale Job");
job.results.putObject("error", error);
job.results.putBoolean("success", false);
jobs.set(job.token(), job);
if (globalJobFinishLatch.trySetCount(1)) {
while (globalJobFinishLatch.getCount() > 0){
globalJobFinishLatch.countDown();
}
}
jobsBeingExecuted.remove(job.token());
jobExecutionNodes.remove(job.token());
}
return isStale[0];
}
/**
* getJob allows you to retrieve a User's Job from the Cluster via
* its token.
*
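* <p>Illustrative lookup (assumes {@code user} and {@code token} are known):</p>
* <pre>{@code
* Job job = JobServer.getJob(user, token);
* if (job == null) {
*     // no job with that token belongs to this user
* }
* }</pre>
*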
* @param user the User you wish to retrieve the Job from
* @param jobToken the token for the Job you are attempting to find
* @return returns the Job if it was found or null if it wasn't
*/
public static Job getJob(User user, String jobToken) {
checkInitialized();
EntryObject e = new PredicateBuilder().getEntryObject();
Predicate predicate = e.get("userToken").equal(user.token())
.and(e.get("token").equal(jobToken));
for (Job job : jobs.values(predicate)) {
if (job.token().equals(jobToken)) {
return job;
}
}
return null;
}
/**
* getJobs retrieves a Collection<Job> for every single Job belonging
* to the specified User.
*
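* <p>For example (illustrative):</p>
* <pre>{@code
* for (Job job : JobServer.getJobs(user)) {
*     System.out.println(job.token() + " -> " + job.status);
* }
* }</pre>
*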
* @param user the User you wish to retrieve the Jobs for
* @return returns a Collection<Job> for the User
*/
public static Collection<Job> getJobs(User user) {
checkInitialized();
EntryObject e = new PredicateBuilder().getEntryObject();
Predicate predicate = e.get("userToken").equal(user.token());
return jobs.values(predicate);
}
/**
* getJobs retrieves a Collection<Job> for every single Job belonging
* to the specified User. This returns 10 results by default.
*
* @param user the User you wish to retrieve the Jobs for
* @param page the page index starting from 0 you are looking for
* @return returns a Collection<Job> for the User
*/
public static Collection<Job> getJobs(User user, int page) {
return getJobs(user, page, 10);
}
/**
* getJobs retrieves a Collection<Job> for every single Job belonging
* to the specified User.
*
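* <p>Illustrative paging through a user's jobs, 25 at a time:</p>
* <pre>{@code
* Collection<Job> firstPage = JobServer.getJobs(user, 0, 25);
* Collection<Job> secondPage = JobServer.getJobs(user, 1, 25);
* }</pre>
*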
* @param user the User you wish to retrieve the Jobs for
* @param page the page index starting from 0 you are looking for
* @param count the number of results to return
* @return returns a Collection<Job> for the User
*/
public static Collection<Job> getJobs(User user, int page, int count) {
checkInitialized();
EntryObject e = new PredicateBuilder().getEntryObject();
com.hazelcast.query.Predicate userJobsPredicate = e.get("userToken").equal(user.token());
if (page < 0) {
page = 0;
}
if (count < 0) {
count = 10;
}
// Max Count is always 100
if (count > 100) count = 100;
// This predicate wraps the previous one and sorts the jobs by date.
// IMPORTANT apparently the comparator cannot be a lambda here.
PagingPredicate pagingPredicate = new PagingPredicate(userJobsPredicate, new JobComparator(), count);
Collection<Job> values = jobs.values(pagingPredicate);
// Advance the paging predicate to the requested page before querying again
if (page > 0) {
while (page > pagingPredicate.getPage()) {
pagingPredicate.nextPage();
}
values = jobs.values(pagingPredicate);
}
return values;
}
/**
* getRunningJobs will return a Collection<Job> containing all
* of the "normal" running jobs for the user.
*
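* <p>Illustrative check of a user's current activity (see also {@code getRunningServices}):</p>
* <pre>{@code
* int runningJobs = JobServer.getRunningJobs(user).size();
* int runningServices = JobServer.getRunningServices(user).size();
* }</pre>
*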
* @param user the User you wish to retrieve the Collection<Job> for
* @return the Collection<Job> you are retrieving
*/
public static Collection<Job> getRunningJobs(User user) {
checkInitialized();
EntryObject e = new PredicateBuilder().getEntryObject();
Predicate predicate = e.get("userToken").equal(user.token())
.and(e.get("service").equal(false))
.and(e.get("status").equal("running"));
// We only return running non-service jobs
return jobs.values(predicate);
}
/**
* getRunningServices will return a Collection<Job> containing all
* of the running service jobs for the user.
*
* @param user the User you wish to retrieve the Collection<Job> for
* @return the Collection<Job> you are retrieving
*/
public static Collection<Job> getRunningServices(User user) {
checkInitialized();
EntryObject e = new PredicateBuilder().getEntryObject();
Predicate predicate = e.get("userToken").equal(user.token())
.and(e.get("service").equal(true))
.and(e.get("status").equal("running"));
return jobs.values(predicate);
}
/**
* getService is used to retrieve a service Job via its serviceName.
*
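* <p>Illustrative lookup of a running service by name (the service name here is hypothetical):</p>
* <pre>{@code
* Job mailer = JobServer.getService(user, "mailer");
* if (mailer == null) {
*     // the service is not currently running for this user
* }
* }</pre>
*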
* @param user the User you wish to retrieve the Job for
* @param serviceName the serviceName for the Job you wish to retrieve
* @return the Job you wish to retrieve or null if it doesn't exist
*/
public static Job getService(User user, String serviceName) {
checkInitialized();
EntryObject e = new PredicateBuilder().getEntryObject();
Predicate predicate = e.get("userToken").equal(user.token())
.and(e.get("service").equal(true))
.and(e.get("status").equal("running"))
.and(e.get("serviceName").equal(serviceName));
// We only want to get running services
Collection<Job> values = jobs.values(predicate);
Iterator<Job> it = values.iterator();
if (it.hasNext()) {
return it.next();
}
return null;
}
/**
* publishEvent is used to publish events on the internal JobServer such as
* halt, error, etc.
*
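* <p>For example, broadcasting an event for a job (the event string is whatever the
* listeners on the job's event address expect, e.g. "halt"):</p>
* <pre>{@code
* JobServer.publishEvent(job, "halt");
* }</pre>
*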
* @param job the Job you are publishing the event for
* @param event the event you are publishing
*/
public static void publishEvent(Job job, String event) {
checkInitialized();
String jobEventIdentifier = "job.server." + job.token() + ".events";
EventBus eventBus = cluster.eventBus();
eventBus.publish(jobEventIdentifier, event.trim());
}
/**
* updateStatus allows you to update a Job's status, for example from
* "error" to "complete".
*
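* <p>Illustrative use; the new status is also published as an event for any listeners:</p>
* <pre>{@code
* JobServer.updateStatus(job, "running");
* }</pre>
*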
* @param job the Job you are setting the status for
* @param status the status you are setting
*/
public static void updateStatus(Job job, String status) {
checkInitialized();
status = status.trim().toLowerCase();
cluster.logger().info("Updating Job status for the job " + job.token() + ": " + status);
job.status = status;
job.updated = new Date();
jobs.set(job.token(), job);
publishEvent(job, status);
}
private Logger logger;
private Async async;
private EventBus eventBus;
private ExecutorService jobExecutorService;
private String nodeId = "";
private Handler<Message> jobEventBusHandler = null;
private long staleJobTimer = -1;
private String defaultJobLogPath = "./fs/logs/";
private int maxJvmSize = 512;
private int minJvmSize = 16;
private IMap<String, String> enabledExecutionFactories;
private long queuedJobCleanupTimer = -1;
private int maxQueuedJobs = 5;
private int minQueuedJobs = 1;
private boolean enableQueuedJobs = false;
// This will store nodeIds
private IMap<String, String> queuedJobMap;
private IMap<String, String> tmpQueuedJobMap;
private Map<String, Process> queuedJobs;
private Process initJobRunnerProcess(Job job, int minJvmSize, int maxJvmSize) throws IOException {
return initJobRunnerProcess(job, minJvmSize, maxJvmSize, defaultExecutionFactory, false, 60 * 24);
}
private Process initJobRunnerProcess(Job job, int minJvmSize, int maxJvmSize, String executionFactory) throws IOException {
return initJobRunnerProcess(job, minJvmSize, maxJvmSize, executionFactory, false, 60 * 24);
}
private Process initJobRunnerProcess(Job job, int minJvmSize, int maxJvmSize, String executionFactory, boolean queued) throws IOException {
return initJobRunnerProcess(job, minJvmSize, maxJvmSize, executionFactory, queued, 60 * 24);
}
private Process initJobRunnerProcess(Job job, int minJvmSize, int maxJvmSize, String executionFactory, boolean queued, long awaitTimeout) throws IOException {
// Is this a new status??
job.status = "waiting";
// Each script must be run by a
// separate JobRunner process
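// Roughly, the command assembled below looks like this (paths, tokens, and sizes
// are illustrative placeholders, not real values):
//   <java.home>/bin/java -Xms<min>m -Xmx<max>m -cp <classpath> \
//       automately.core.services.job.JobRunner <nodeId> <base64NodeList> <configPath> <jobToken> <executionFactory> [<awaitTimeout> when queued]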
String javaHome = System.getProperty("java.home");
String javaBin = javaHome +
File.separator + "bin" +
File.separator + "java";
String classpath = System.getProperty("java.class.path");
String className = JobRunner.class.getCanonicalName();
String clusterHost = "127.0.0.1:5271";
try {
Address address = cluster.hazelcast().getCluster().getLocalMember().getAddress();
clusterHost = address.getHost() + ":" + address.getPort();
} catch (Exception ignored) {
}
JsonArray nodeList = new JsonArray();
nodeList.add(clusterHost);
cluster.hazelcast().getCluster().getMembers().forEach(member -> {
Address memberAddress = member.getAddress();
String address = memberAddress.getHost() + ":" + memberAddress.getPort();
if(!nodeList.contains(address)){
nodeList.add(address);
}
});
String nodeListStr = Base64.encodeBytes(nodeList.encode(true).getBytes(), Base64.DONT_BREAK_LINES);
String jobToken = job.token();
String[] args;
String configPath = cluster().config().getConfigPath();
if (queued) {
jobToken = "await_" + jobToken;
args = new String[]{
javaBin, "-Xms" + minJvmSize + "m", "-Xmx" + maxJvmSize + "m", "-cp", classpath,
className, cluster().manager().nodeId(), nodeListStr, configPath, jobToken, executionFactory,
String.valueOf(awaitTimeout)
};
} else {
args = new String[]{
javaBin, "-Xms" + minJvmSize + "m", "-Xmx" + maxJvmSize + "m", "-cp", classpath,
className, cluster().manager().nodeId(), nodeListStr, configPath, jobToken, executionFactory
};
}
ProcessBuilder builder = new ProcessBuilder(args);
Path logFile = Paths.get(defaultJobLogPath + job.token() + ".log");
Path logFolder = Paths.get(defaultJobLogPath);
if (!Files.exists(logFolder)) {
Files.createDirectories(logFolder);
}
if (!Files.exists(logFile)) {
Files.createFile(logFile);
}
builder.redirectError(logFile.toAbsolutePath().toFile());
logger.info("Starting process for the job \"" + job.token() + "\"...");
return builder.start();
}
private void initQueuedJobs() {
initQueuedJobs(minJvmSize, maxJvmSize, 60 * 24, maxQueuedJobs);
}
private void initQueuedJobs(int count) {
initQueuedJobs(minJvmSize, maxJvmSize, 60 * 24, count);
}
private void initQueuedJobs(int minJvmSize, int count) {
initQueuedJobs(minJvmSize, maxJvmSize, 60 * 24, count);
}
private void initQueuedJobs(int minJvmSize, int maxJvmSize, int count) {
initQueuedJobs(minJvmSize, maxJvmSize, 60 * 24, count);
}
private void initQueuedJobs(int minJvmSize, int maxJvmSize, int awaitTimeout, int count) {
if (!enableQueuedJobs) {
return;
}
if (queuedJobs == null) {
queuedJobs = new ConcurrentHashMap<>();
// Every 60 seconds is a reasonable interval to clean up dead queued-job processes
queuedJobCleanupTimer = async.setPeriodic(1000 * 60, event -> queuedJobs.forEach((s, process) -> {
if(!process.isAlive()){
queuedJobs.remove(s);
tmpQueuedJobMap.remove(s);
queuedJobMap.remove(s);
}
}));
}
// Add a job if the queued job size is less than the minimum
// Do not add a job if the queued size is greater than the maximum
while (minQueuedJobs > queuedJobs.size() || (maxQueuedJobs > queuedJobs.size() && count > 0)) {
count--;
Job tmpJob = new Job();
try {
// We are initializing the default ExecutionContextFactory since these jobs are queued
Process process = initJobRunnerProcess(tmpJob, minJvmSize, maxJvmSize, defaultExecutionFactory, true, awaitTimeout);
queuedJobs.put(tmpJob.token(), process);
// This will allow queued jobs to work
// from jobs submitted from another job
queuedJobMap.put(tmpJob.token(), this.nodeId);
} catch (IOException e) {
e.printStackTrace();
}
}
}
private void gracefullyStopQueuedJob(String jobToken){
if(tmpQueuedJobMap.containsKey(jobToken)){
// We cannot block the main event loop so we run the timeout
// in its own thread
new Thread(() -> {
Thread.currentThread().setName(jobToken + "_queued_timeout");
HazelcastInstance hz = cluster.hazelcast();
// We need to tell the process to stop
// since we reached a timeout
ICountDownLatch runnerAwaitLatch = hz.getCountDownLatch("_jobrunner_await_" + jobToken);
if(runnerAwaitLatch.getCount() == 0){
// This helps ensure the JobRunner is in fact ready, since it is possible that it is not
ICountDownLatch awaitContinueLatch = hz.getCountDownLatch("_jobrunner_awaitcont_" + jobToken);
awaitContinueLatch.trySetCount(1);
// If the JobRunner hasn't finished starting, give it up to 15 seconds.
try {
awaitContinueLatch.await(15, TimeUnit.SECONDS);
} catch (InterruptedException ignored) {
}
}
while (runnerAwaitLatch.getCount() > 0) {
runnerAwaitLatch.countDown();
}
}).start();
}
}
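/**
* getQueuedJob hands out one of the pre-started (queued) JobRunner processes, if any are
* available, so a submitted job can reuse an already-running process instead of starting
* a new one. This overload is equivalent to {@code getQueuedJob(false, false)}.
*
* @return a placeholder Job whose token maps to a warm JobRunner process, or null if none is available
*/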
public Job getQueuedJob(){
return getQueuedJob(false, false);
}
public Job getQueuedJob(boolean random, boolean localOnly) {
if (queuedJobMap.size() > 0) {
List<String> queuedJobIds;
if(localOnly && (cluster().config().isRole("job") || cluster().config().isAll())){
queuedJobIds = new ArrayList<>(queuedJobMap.keySet(Predicates.equal("toString", this.nodeId)));
} else {
queuedJobIds = new ArrayList<>(queuedJobMap.keySet());
}
if(random){
// Let's attempt to get a random job
Collections.shuffle(queuedJobIds);
}
Iterator<String> iterator = queuedJobIds.iterator();
while (iterator.hasNext()) {
String jobId = iterator.next();
Job tmpJob = new Job();
tmpJob.loadJson(new JsonObject().putString("token", jobId));
String nodeId = queuedJobMap.remove(tmpJob.token());
if (!registeredJobServers.containsKey(nodeId)) {
continue;
}
// Let's store the serverId
tmpQueuedJobMap.put(tmpJob.token(), nodeId);
// Let's go ahead and set a timer that will ensure
// the retrieved queued job will shut down if it's not used.
// This will ensure there aren't any rogue jobs.
async.setTimer(60 * 1000, event -> {
try {
if(!jobs().containsKey(tmpJob.token())){
gracefullyStopQueuedJob(tmpJob.token());
}
} catch (Exception ignored){
}
});
return tmpJob;
}
}
return null;
}
@Override
public void start(Cluster owner) {
cluster = owner;
this.logger = cluster.logger();
this.async = cluster.async();
this.eventBus = cluster.eventBus();
Config config = cluster.config();
// We use this so we can queue up jobs that don't get processed due to load
jobsBeingExecuted = cluster.data().getSet("jobs.executing");
jobExecutionNodes = cluster.data().getMap("jobs.executing.nodes");
registeredJobServers = cluster.data().getMap("job.server.nodes");
queuedJobMap = cluster.data().getMap("jobs.queued");
tmpQueuedJobMap = cluster.data().getMap("jobs.queued.tmp");
enabledExecutionFactories = cluster.data().getMap("jobs.execution.factories");
jobs = jobs();
initialized = true;
IMap<String, Job> registeredServices = cluster.data().persistentMap("job.server.user.services");
// If we are not a job server or our role isn't configured for all
// then we do not need to continue.
if ((!config.isRole("job") && !config.isAll()) || cluster().manager().clientMode()) return;
JsonObject jobServerConfig = coreConfig().getObject("job", new JsonObject());
if (!jobServerConfig.containsField("max_jobs")) {
jobServerConfig.putNumber("max_jobs", 50);
}
if (!jobServerConfig.containsField("max_queued_jobs")) {
jobServerConfig.putNumber("max_queued_jobs", 5);
}
if (!jobServerConfig.containsField("min_queued_jobs")) {
jobServerConfig.putNumber("min_queued_jobs", 2);
}
if (!jobServerConfig.containsField("enable_queued_jobs")) {
jobServerConfig.putBoolean("enable_queued_jobs", true);
}
// Let's set some default execution factories..
if (!jobServerConfig.containsField("execution_factories")) {
JsonArray defaultExecutionFactories = new JsonArray();
// Let's go ahead and add the default execution factory
JsonObject defaultFactoryConf = new JsonObject();
defaultFactoryConf.putString("platformId", "default");
defaultFactoryConf.putString("executionFactory", defaultExecutionFactory);
// Default is always enabled - maybe this should be changed
//defaultFactoryConf.putBoolean("enabled", true);
// Let's go ahead and add the HelloWorld execution factory
JsonObject helloWorldFactoryConf = new JsonObject();
helloWorldFactoryConf.putString("platformId", "hello");
helloWorldFactoryConf.putString("executionFactory", HelloWorldContextFactory.class.getCanonicalName());
helloWorldFactoryConf.putBoolean("enabled", true);
// Let's go ahead and add the V8 execution factory
JsonObject v8FactoryConf = new JsonObject();
v8FactoryConf.putString("platformId", "v8");
v8FactoryConf.putString("executionFactory", V8ContextFactory.class.getCanonicalName());
v8FactoryConf.putBoolean("enabled", false);
defaultExecutionFactories.add(defaultFactoryConf);
defaultExecutionFactories.add(helloWorldFactoryConf);
defaultExecutionFactories.add(v8FactoryConf);
jobServerConfig.putArray("execution_factories", defaultExecutionFactories);
}
JsonArray executionFactories = jobServerConfig.getArray("execution_factories", new JsonArray());
for (Object factoryConf : executionFactories) {
if(factoryConf instanceof JsonObject){
JsonObject jsonFactoryConf = (JsonObject) factoryConf;
String platformId = jsonFactoryConf.getString("platformId", "");
String executionFactory = jsonFactoryConf.getString("executionFactory", "");
// We can skip this since it doesn't look like an execution factory
if(platformId.isEmpty() || executionFactory.isEmpty()){
continue;
}
if(jsonFactoryConf.getBoolean("enabled", false)){
enabledExecutionFactories.put(platformId, executionFactory);
} else {
// We need to remove it because all nodes
// need the same configuration when it comes to this
enabledExecutionFactories.remove(platformId);
}
}
}
// We need to ensure the default execution factory is always enabled
if(!enabledExecutionFactories.containsKey("default")){
enabledExecutionFactories.set("default", defaultExecutionFactory);
}
coreConfig().putObject("job", jobServerConfig);
config.save();
// END configuration settings.
// This value is used by _serverConfig
String jobServerType = jobServerConfig.getString("server_type", "all");
logger.info("Server type is \"" + jobServerType + "\"");
int maxJobs = jobServerConfig.getInteger("max_jobs");
this.nodeId = cluster.manager().nodeId();
// We need to store this information temporarily so we can save it in registeredJobServers
jobServerConfig.putBoolean("clientMode", cluster.manager().clientMode());
jobServerConfig.putString("nodeId", this.nodeId);
registeredJobServers.set(this.nodeId, jobServerConfig);
jobExecutorService = Executors.newFixedThreadPool(maxJobs + 5); // Pool size is the configured max jobs plus 5 as a small safety buffer
// Job handler - this is an event bus handler that actually handles our job execution
jobEventBusHandler = (Message event) -> {
if (event.body() != null) {
if (event.body() instanceof String &&
jobs().containsKey(event.body().toString())) {
jobExecutorService.submit(new Runnable() {
@Override
public void run() {
// Here we will handle the actual processing of the job.
Job job = jobs().get(event.body().toString());
updateStatus(job, "processing");
// Create an ICountDownLatch so we can let the cluster know that we are not finished running this job.
ICountDownLatch globalJobFinishLatch = cluster.hazelcast().getCountDownLatch(job.token() + "_job_finish_latch");
globalJobFinishLatch.trySetCount(1);
// Store the current job token so other nodes can know that this job is being executed with a simple check
jobsBeingExecuted.add(job.token());
// Store the current node handling the execution of this job
jobExecutionNodes.set(job.token(), cluster().manager().nodeId());
// Begin timeout handling - This ensures jobs are not running forever
// By default all jobs are timed out at 15 minutes unless they are a service job
long defaultTimeout = TimeUnit.MINUTES.toMillis(15);
if (job.service) {
defaultTimeout = 0; // Service jobs do not have a timeout
}
String executionAddr = "job.server." + job.token() + ".execution";
long timeoutTimer = 0;
// If the defaultTimeout is 0 we never trigger a timeout. This can be dangerous; use at your own risk.
if (defaultTimeout > 0) {
timeoutTimer = async.setTimer(defaultTimeout, aLong -> eventBus.publish(executionAddr, "timeout"));
}
// if script is null we will pull script data from job..
Job completedJob;
// This will help let us know that the job is being handled.
ILock executionLock = cluster.hazelcast().getLock("_job_lock_execution_" + job.token());
executionLock.lock();
// We store the process in an array so the execution handler
// can access it
Process[] process = new Process[1];
if (tmpQueuedJobMap.containsKey(job.token()) &&
queuedJobs.containsKey(job.token())) {
process[0] = queuedJobs.get(job.token());
queuedJobs.remove(job.token());
tmpQueuedJobMap.remove(job.token());
// Let's go ahead and initialize some new queued jobs in its place
if (queuedJobs.size() < minQueuedJobs) {
// We run this in its own thread so we don't tie up the async event loop
new Thread(() -> {
Thread.currentThread().setName("init-queued-jobs");
// Let's try to start a few more queued jobs
initQueuedJobs(5);
}).start();
}
}
Buffer consoleBuffer = new Buffer();
// Let's allow the console buffer to be retrieved..
Handler<Message> jobPrintStreamHandler = message -> {
if (!(message.body() instanceof String)) return;
String method = (String) message.body();
if (method.equals("retrieve")) {
message.reply(consoleBuffer);
}
};
// We need a simple way to retrieve the console
eventBus.registerHandler("job.server." + job.token() + ".printStreamBuffer", jobPrintStreamHandler);
try {
final long finalTimeoutTimer = timeoutTimer;
// This is used so we can tell the job script execution to error, stop, timeout, or halt
// Begin handler for job execution control
eventBus.registerHandler(executionAddr, new Handler<Message>() {
@Override
public void handle(io.jsync.eventbus.Message message) {
if (!(message.body() instanceof String)) return;
String method = (String) message.body();
logger.info("Execution event received \"" + method + "\"...");
if (method.equals("halt") || method.equals("stop") || method.equals("timeout") || method.equals("error")) {
// This is extremely important so we do not update the status of the job improperly
Job latestJob = jobs().get(job.token());
// Something outside our scope requested this, so update the job's status accordingly,
// then tell the JobRunner to terminate and remove this handler once it has stopped.
switch (method) {
case "error":
// Handle async error
// We don't need to do anything
break;
case "stop":
updateStatus(latestJob, "stopping");
break;
case "timeout":
updateStatus(latestJob, "timeout");
break;
}
Handler<Message> self = this;
new Thread(() -> {
try {
// This will go ahead and tell the JobRunner to gracefully terminate
eventBus.publish(executionAddr, "kill");
// We are going to tell the final shutdown to wait at least
// 30 seconds before we forcibly destroy it
process[0].waitFor(30, TimeUnit.SECONDS);
} catch (InterruptedException ignored) {
} finally {
process[0].destroyForcibly();
forceKillJob(latestJob);
eventBus.unregisterHandler(executionAddr, self);
}
}).start();
} else if (method.equals("cancel_timeout")) {
async.cancelTimer(finalTimeoutTimer);
logger.info("Canceling timeout for job " + job.token());
}
}
});
if (process[0] == null) {
logger.info("Starting process for the job " + job.token());
// Default should always be enabled
process[0] = initJobRunnerProcess(job, minJvmSize, maxJvmSize, enabledExecutionFactories.get(job.config.getString("platform", "default")));
} else {
if (process[0].isAlive()) {
logger.info("Continuing process for the job " + job.token());
// Do nothing
ICountDownLatch runnerAwaitLatch = cluster.hazelcast().getCountDownLatch("_jobrunner_await_" + job.token());
if(runnerAwaitLatch.getCount() == 0){
// This helps ensure the JobRunner is in fact ready, since it is possible
// that it has not finished starting yet
ICountDownLatch awaitContinueLatch = cluster.hazelcast().getCountDownLatch("_jobrunner_awaitcont_" + job.token());
awaitContinueLatch.trySetCount(1);
// If the JobRunner hasn't finished starting, give it up to 60 seconds.
awaitContinueLatch.await(60, TimeUnit.SECONDS);
}
while (runnerAwaitLatch.getCount() > 0) {
runnerAwaitLatch.countDown();
}
} else {
process[0] = initJobRunnerProcess(job, minJvmSize, maxJvmSize);
}
}
// This keeps the console output in memory
// in case we have to kill the process
InputStream consoleStream = process[0].getInputStream();
// Let's ensure we read the console stream.
new Thread(() -> {
try {
InputStreamReader isr = new InputStreamReader(consoleStream);
BufferedReader br = new BufferedReader(isr);
int c;
while ((c = br.read()) != -1) {
if (cluster().config().isDebug()) {
System.out.print(((char) c));
}
consoleBuffer.appendByte((byte) c);
}
} catch (IOException ignored) {
}
}).start();
// End handler for job execution control
process[0].waitFor();
} catch (IOException | InterruptedException e) {
logger.info("The job " + job.token() + " was interrupted.");
} finally {
completedJob = jobs().get(job.token());
// Let's store the output from the consoleBuffer object.
// This makes it easier to have actual console results
completedJob.results.putString("output", consoleBuffer.toString());
// Let's tell the job to kill itself
eventBus.publish(executionAddr, "kill");
if (process[0] != null) {
try {
process[0].waitFor(30, TimeUnit.SECONDS);
} catch (Exception e) {
e.printStackTrace();
}
logger.info("Destroying process for the job " + job.token());
process[0].destroyForcibly();
forceKillJob(completedJob);
}
eventBus.unregisterHandler("job.server." + job.token() + ".printStreamBuffer", jobPrintStreamHandler);
}
final Job finalJob = completedJob;
if (finalJob.results.containsField("error")) {
cluster.eventBus().publish("private.job." + job.token() + ".printStream", new Buffer(finalJob.results.getObject("error",
new JsonObject()).getString("message", "error") + "\n"));
}
// Cancel the timeout timer so it doesn't fire after the job has finished
async.cancelTimer(timeoutTimer);
// We want to remove all reserved variables (config fields starting with "_")
// For some reason leaving these fields in causes the job to stay open
for (String key : finalJob.config.toMap().keySet()) {
if (key.startsWith("_")) {
finalJob.config.removeField(key);
}
}
// Ensure it gets stored
updateStatus(finalJob, "complete");
jobs().set(finalJob.token(), finalJob);
jobsBeingExecuted.remove(finalJob.token());
jobExecutionNodes.remove(finalJob.token());
if (finalJob.config.containsField("callbackUrl")) {
async.runOnContext(event -> {
try {
HttpClient httpClient = async.createHttpClient();
JsonObject formatted = new JsonObject();
formatted.putString("token", finalJob.token());
formatted.putValue("created", finalJob.created);
formatted.putValue("updated", finalJob.updated);
formatted.putString("status", finalJob.status);
JsonObject formattedResults = new JsonObject();
formattedResults.putBoolean("success", finalJob.results.getBoolean("success", false));
if (finalJob.results.containsField("error")) {
formattedResults.putObject("error", finalJob.results.getObject("error"));
}
formatted.putObject("results", formattedResults);
String postbackUrl = finalJob.config.getString("callbackUrl");
httpClient.post(postbackUrl, event12 -> logger.debug("callbackUrl Response Received: " + event12 + " (" + postbackUrl + ")")).putHeader("Content-Type", "application/json")
.putHeader("User-Agent", "Automately-Job-Callback")
.end(formatted.encode());
} catch (Exception ignored) {
}
});
}
while (globalJobFinishLatch.getCount() > 0) {
globalJobFinishLatch.countDown();
}
// This is a way to tell any waiting handlers that the job is indeed finished
eventBus.publish("job.server." + job.token() + ".finished", "finished");
}
});
}
}
};
// We register a handler so we have a place that receives events for jobs
cluster.eventBus().registerHandler("job.server." + this.nodeId, jobEventBusHandler);
maxJvmSize = jobServerConfig.getInteger("max_jvm_size", 512);
minJvmSize = jobServerConfig.getInteger("min_jvm_size", 16);
logger.info("Max jobs set to " + maxJobs);
maxQueuedJobs = jobServerConfig.getInteger("max_queued_jobs");
minQueuedJobs = jobServerConfig.getInteger("min_queued_jobs");
logger.info("Maximum queued jobs set to " + maxQueuedJobs);
logger.info("Minimum queued jobs set to " + minQueuedJobs);
enableQueuedJobs = jobServerConfig.getBoolean("enable_queued_jobs", true);
if (enableQueuedJobs) {
initQueuedJobs(maxQueuedJobs);
}
// This exists so we can ensure that all the data for the dataBus is pre-loaded for
// anything accessing it such as the DataBusObject
cluster.data().persistentMap("dataBus");
if (!cluster.hazelcast().getPartitionService().isClusterSafe()) {
// Wait for the local member's partitions to be safe before starting jobs
// If the cluster is big it may take up to 10 minutes for it to be ready
cluster.hazelcast().getPartitionService().forceLocalMemberToBeSafe(10, TimeUnit.MINUTES);
}
// TODO improve startup scripts
// Startup Scripts are called right before any other job gets started when the JobServer first starts.
// This allows you to have scripts running on the server that can be handling many things
JsonArray scriptsToStart = jobServerConfig.getArray("startup_scripts", new JsonArray());
// Begin Startup Scripts
for (Object value : scriptsToStart) {
if (value instanceof String && value.toString().split(":").length > 1) {
String newVal = (String) value;
String user = newVal.split(":")[0];
String script = newVal.split(":")[1];
User mUser = UserData.getUserByUsername(user);
if (mUser != null) {
// TODO replace with UserFileSystem usage
if (VirtualFileSystem.containsUserFile(mUser, script)) {
VirtualFile file = VirtualFileSystem.getUserFile(mUser, script);
JsonObject scriptConfig = new JsonObject();
logger.info("Attempting to start a job for the startup script " + script);
scriptConfig.putString("scriptPath", file.pathAlias);
scriptConfig.putString("scriptData", VirtualFileSystem.readFileData(file).toString());
Job newJob = new Job();
newJob.config = new JsonObject().putObject("script", scriptConfig);
// we make sure service is false because the script will handle itself if it is a service
newJob.service = false;
newJob.serviceConfig = new JsonObject();
newJob.serviceName = ""; // Make it empty by default
newJob.userToken = mUser.token();
try {
newJob = submit(newJob);
logger.info("Started new startup job " + newJob.token() + " for the script " + script);
} catch (Exception e) {
logger.error("Failed to start new startup job " + newJob.token() + " for the script " + script);
}
} else {
logger.error("Failed to to start \"" + newVal + "\". The file " + script + " does not exist.");
}
} else {
logger.error("Failed to to start \"" + newVal + "\". The user " + user + " does not exist.");
}
}
}
// End Startup Scripts
CountDownLatch waitLatch = new CountDownLatch(1);
Timer startupTimer = new Timer();
startupTimer.schedule(new TimerTask() {
@Override
public void run() {
waitLatch.countDown();
}
}, 15000);
try {
waitLatch.await(2, TimeUnit.MINUTES);
} catch (InterruptedException e) {
logger.warn("Timeout reached while waiting for the startup script timer to finish.");
}
if (jobServerConfig.getBoolean("autostart_services", true)) {
// Begin the startup of all registered services.
for (Job job : registeredServices.values()) {
// We can go ahead and clone the job and then submit it
Job newJob = new Job();
newJob.config = job.config;
newJob.service = false; // We set this to false because services will call initService
newJob.serviceConfig = job.serviceConfig;
newJob.serviceName = job.serviceName;
newJob.userToken = job.userToken;
// Ensure that we do not start up a service when there has already been a job started for one.
Collection<Job> existingServices = jobs().values(Predicates.and(Predicates.equal("userToken", newJob.userToken),
Predicates.equal("serviceName", newJob.serviceName),
Predicates.or(Predicates.equal("status", "running"),
Predicates.equal("status", "queued"),
Predicates.equal("status", "processing")
)));
if (!existingServices.isEmpty()) {
// Consider the service already running only if at least one existing job is not stale
boolean alreadyRunning = false;
for (Job existing : existingServices) {
if (isStale(existing)) {
logger.debug("The job " + existing.token() + " went stale.");
// This will attempt to kill it just in case
cluster.eventBus().publish("job.server." + existing.token() + ".execution", "kill");
// Just to tell other things waiting to finish it
cluster.eventBus().publish("job.server." + existing.token() + ".finished", "finished");
// This code is a last resort method of shutting down the job.
forceKillJob(existing);
} else {
// At least one existing service job is still live
alreadyRunning = true;
}
}
if (alreadyRunning) {
logger.error("Failed to start new service job " + newJob.token() + " for the service " + newJob.serviceName + " for the user " + newJob.userToken + " because a service already has been started.");
return;
}
}
try {
submit(newJob);
logger.debug("Started new service job " + newJob.token() + " for the service " + newJob.serviceName + " for the user " + newJob.userToken);
} catch (Exception e) {
logger.error("Failed to start new service job " + newJob.token() + " for the service " + newJob.serviceName + " for the user " + newJob.userToken);
}
}
}
if (!cluster.manager().clientMode()) {
ExecutorService staleExecutor = Executors.newSingleThreadExecutor();
Runnable staleJobHandler = () -> {
logger.debug("Processing old jobs.");
for (Job job : jobs().values()) {
if (isStale(job)) {
logger.debug("The job " + job.token() + " went stale.");
cluster.eventBus().publish("job.server." + job.token() + ".execution", "kill");
// Just to tell other things waiting to finish it
cluster.eventBus().publish("job.server." + job.token() + ".finished", "finished");
// Last resort method to kill the job
forceKillJob(job);
} else if (isJobExpired(job, 14)) {
logger.debug("Removing the job " + job.token() + " because it has expired. (over 14 days old)");
jobs().remove(job.token());
} else if (isJobExpired(job, 5)) {
logger.info("Scrubbing the job " + job.token() + " because it over 5 days old.");
try {
if (job.results != null && job.results.containsField("output")) {
job.results.putString("output", "Output Scrubbed");
}
job.config = new JsonObject();
job.updated = new Date();
jobs().set(job.token(), job);
} catch (Exception e) {
e.printStackTrace();
}
}
}
};
// Ensure we never block the main event loop
staleJobTimer = async.setPeriodic(TimeUnit.MINUTES.toMillis(30), event -> staleExecutor.execute(staleJobHandler));
// Run the first stale-job check shortly after startup
async.setTimer(15000, event -> staleExecutor.execute(staleJobHandler));
} else {
logger.warn("Not checking for stale jobs since we are in client mode.");
}
}
/**
* This method is used to submit a job to the cluster.
*
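* <p>A minimal sketch of submitting a script job (assumes {@code jobServer} is the running
* JobServer instance and {@code user} is a loaded User; the script config mirrors what the
* startup-script handling in {@code start()} builds):</p>
* <pre>{@code
* JsonObject scriptConfig = new JsonObject()
*         .putString("scriptData", "console.log('hello');");
* Job newJob = new Job();
* newJob.userToken = user.token();
* newJob.config = new JsonObject().putObject("script", scriptConfig);
* newJob = jobServer.submit(newJob);
* }</pre>
*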
* @param job the Job you wish to send to the Cluster
* @return returns the Job after it has been submitted, or null if it could not be submitted
*/
public Job submit(final Job job) {
if (job == null) {
throw new NullPointerException("Your job cannot be null.");
}
if (job.userToken == null || job.userToken.isEmpty()) {
throw new NullPointerException("Your job's userToken cannot be null or empty.");
}
if (job.service && job.serviceConfig == null) {
throw new IllegalArgumentException("Cannot start a new service job with an empty service config");
}
if (registeredJobServers.size() < 1) {
throw new RuntimeException("Cannot submit a job when there are no registered job servers.");
}
// Determine which platform to use; the execution factory is looked up in enabledExecutionFactories
String platformId = job.config.getString("platform", "default");
// Here we will go ahead and try to retrieve
// a queued job to replace the given Job. If one
// is found then the token will be updated.
if ((!tmpQueuedJobMap.containsKey(job.token()) && !queuedJobMap.containsKey(job.token()) &&
queuedJobs != null && !queuedJobs.containsKey(job.token())) && platformId.equals("default")){
Job queuedJob = getQueuedJob();
if (queuedJob != null) {
// This copies the queued job's token onto the submitted job
job.loadJson(new JsonObject().putString("token", queuedJob.token()));
}
}
if(!platformId.equals("default") && !enabledExecutionFactories.containsKey(platformId)){
throw new RuntimeException(platformId + " is an invalid or disabled platformId!");
}
// We set the job as queued so other nodes know that
// the job is going to be processed in the cluster.
job.status = "queued";
// We must store this job inside the cluster
// so we can access it across multiple nodes.
jobs().set(job.token(), job);
if (!jobsBeingExecuted.contains(job.token())) {
final ILock handleLock = cluster().hazelcast().getLock("_job_lock_" + job.token());
if (!handleLock.isLocked()) {
try {
// We must acquire the lock so we
// don't handle the job multiple times in the cluster
if (handleLock.tryLock()) {
User jobUser = UserData.getUserByToken(job.userToken);
if (jobUser != null) {
if (job.service) {
Meta maxServiceJobs = UserData.getMeta(jobUser, "max_service_jobs");
if (maxServiceJobs != null) {
if (maxServiceJobs.value instanceof Number) {
Number max = (Number) maxServiceJobs.value;
// Check for jobs owned by the user that are service jobs and are currently running
EntryObject e = new PredicateBuilder().getEntryObject();
Predicate p = e.get("userToken").equal(jobUser.token())
.and(e.get("service").equal(true))
.and(e.get("status").equal("running"));
if (jobs().values(p).size() > max.intValue()) {
JsonObject newResults = new JsonObject();
newResults.putBoolean("success", false);
JsonObject error = new JsonObject();
error.putString("code", "Quota Reached");
error.putString("message", "You have reached your maximum amount of service jobs you can run at the same time.");
newResults.putObject("error", error);
job.status = "quota_reached";
job.results = newResults;
// Let's attempt to stop a queued job if it exists.
gracefullyStopQueuedJob(job.token());
jobs().set(job.token(), job);
return job;
}
}
}
} else {
// Check for the Maximum Concurrent Allowed Jobs Per User
Meta maxConcurrentJobs = UserData.getMeta(jobUser, "max_jobs");
if (maxConcurrentJobs != null) {
if (maxConcurrentJobs.value instanceof Number) {
Number max = (Number) maxConcurrentJobs.value;
// Check for jobs owned by the user that are not service jobs and are currently running
EntryObject e = new PredicateBuilder().getEntryObject();
Predicate p = e.get("userToken").equal(jobUser.token())
.and(e.get("service").equal(false))
.and(e.get("status").equal("running"));
if (jobs().values(p).size() > max.intValue()) {
JsonObject newResults = new JsonObject();
newResults.putBoolean("success", false);
JsonObject error = new JsonObject();
error.putString("code", "Quota Reached");
error.putString("message", "You have reached your maximum amount of jobs you can run at the same time.");
newResults.putObject("error", error);
job.status = "quota_reached";
job.results = newResults;
// Let's attempt to stop a queued job if it exists.
gracefullyStopQueuedJob(job.token());
jobs().set(job.token(), job);
return job;
}
}
}
}
String jobServerToUse = null;
JsonObject jobConfig = job.config;
if (tmpQueuedJobMap.containsKey(job.token())) {
// We retrieve the server from here to utilize queued jobs
jobServerToUse = tmpQueuedJobMap.get(job.token());
logger.info("Submitting the queued job to the server \"" + jobServerToUse + "\"");
} else {
// This code cannot be used if hazelcast is in client mode for jCluster
if (coreConfig().getObject("job", new JsonObject()).getBoolean("execute_on_least_jobs", true)) {
JsonObject leastMemberConfig = null;
Set<String> keys;
if (jobConfig.containsField("_serverConfig")) {
keys = registeredJobServers.keySet(new JsonQueryPredicate(jobConfig.getObject("_serverConfig",
new JsonObject())));
} else {
keys = registeredJobServers.keySet();
}
for (String nodeId : keys) {
JsonObject memberConfig = registeredJobServers.get(nodeId);
if (leastMemberConfig != null) {
int memberSize = jobExecutionNodes.values(Predicates.equal("toString", nodeId)).size();
int leastMemberSize = jobExecutionNodes.values(Predicates.equal("toString", leastMemberConfig.getString("nodeId"))).size();
if (memberSize < leastMemberSize) {
leastMemberConfig = memberConfig;
}
} else {
leastMemberConfig = memberConfig;
}
}
if (leastMemberConfig != null) {
jobServerToUse = leastMemberConfig.getString("nodeId");
}
}
}
if (jobServerToUse == null) {
// Let's choose a random server since we haven't
// selected one to use yet
Set<String> keys;
if (jobConfig.containsField("_serverConfig")) {
keys = registeredJobServers.keySet(new JsonQueryPredicate(jobConfig.getObject("_serverConfig",
new JsonObject())));
} else {
keys = registeredJobServers.keySet();
}
List<String> nList = new ArrayList<>(keys);
Collections.shuffle(nList);
jobServerToUse = nList.iterator().next();
}
// Now let's actually execute this job by publishing it to the cluster.
String serverId = "job.server." + jobServerToUse;
logger.info("Submitting the job " + job.token() + " to \"" + serverId + "\"");
cluster.eventBus().publish(serverId, job.token());
return job;
}
}
} catch (Exception e) {
e.printStackTrace();
} finally {
handleLock.unlock();
}
return job;
}
}
// We return null if the job cannot be submitted for some reason
return null;
}
private void forceKillJob(Job job) {
if (!Windows.isWindows()) {
try {
Runtime.getRuntime().exec("kill -9 `ps -eo pid,args --cols=10000 | awk '/" + job.token()
+ "/ && $1 != PROCINFO[\"pid\"] { print $1 }'` &> /dev/null");
} catch (Exception ignored) {
}
}
}
/**
* This is a simple utility to check if a job has expired.
*
* @param job the Job to check
* @param days the number of days after which an inactive job is considered expired
* @return true if the job is not currently active and was last updated at least the given number of days ago
*/
private boolean isJobExpired(Job job, int days) {
if (job == null) {
throw new NullPointerException();
}
String status = job.status;
// This means we are already processing it.
if (status.equals("running") || status.equals("queued") || status.equals("processing")) {
return false;
}
long howManyDays = TimeUnit.MILLISECONDS.toDays(((new Date())).getTime() - job.updated.getTime());
return howManyDays >= days;
}
@Override
public void stop() {
if (!nodeId.isEmpty() && jobEventBusHandler != null) {
logger.info("Shutting down the JobServer for the node " + nodeId);
// Unregister the handler that receives job events for this node
cluster.eventBus().unregisterHandler("job.server." + nodeId, jobEventBusHandler);
registeredJobServers.remove(this.nodeId);
// We need to ensure we stop the processes for all the queued jobs
if (queuedJobs != null) {
for (Map.Entry<String, Process> queuedJob : queuedJobs.entrySet()) {
try {
Process process = queuedJob.getValue();
logger.info("Stopping process for the queued job " + queuedJob.getKey() + "...");
queuedJobs.remove(queuedJob.getKey());
queuedJobMap.remove(queuedJob.getKey());
tmpQueuedJobMap.remove(queuedJob.getKey());
process.destroyForcibly();
} catch (Exception ignored) {
}
}
}
if(queuedJobCleanupTimer > -1){
async.cancelTimer(queuedJobCleanupTimer);
}
if (staleJobTimer > -1) {
async.cancelTimer(staleJobTimer);
}
Collection<String> handlingJobs = jobExecutionNodes.keySet(Predicates.equal("toString", nodeId));
logger.debug("There are " + handlingJobs.size() + " being handled by the node " + this.nodeId);
// This latch helps us speed up the shutdown
CountDownLatch processLatch = new CountDownLatch(handlingJobs.size());
ExecutorService shutdownService = Executors.newCachedThreadPool();
// Let's automatically handle jobs for this node
for (String jobToken : handlingJobs) {
logger.debug("Attempting to cleanup the job " + jobToken);
Job job = jobs().get(jobToken);
if (job != null) {
// We can submit the shutdown to the shutdown service
// so we can process more than one job at a time.
shutdownService.submit(() -> {
try {
// We can tell whatever handler to let the job finish.
ICountDownLatch globalJobFinishLatch = cluster.hazelcast().getCountDownLatch(job.token() + "_job_finish_latch");
// This will tell the migration handler to start only if there is a jobserver available
if(jobExecutionNodes.size() > 0){
cluster.eventBus().publish("job.server." + job.token() + ".execution", "migrate");
}
if (job.service) {
logger.info("Attempting to migrate the job " + job.token() + "...");
ICountDownLatch serviceReadyLatch = cluster.hazelcast().getCountDownLatch(job.userToken + "_" + job.serviceName + "_service_ready_latch");
try {
// Let's go ahead and attempt to set the count giving
// the latch some time to wait. This is safe because if
// the service doesn't get migrated it will still be stopped
serviceReadyLatch.trySetCount(5);
serviceReadyLatch.await(15, TimeUnit.SECONDS);
} catch (Exception ignored) {
}
}
// Let's wait up to 5 seconds for the job
// to handle its own migration.
try {
globalJobFinishLatch.await(5, TimeUnit.SECONDS);
} catch (InterruptedException ignored) {
}
// We are going to send a direct hook to the job to tell it to halt
cluster.eventBus().publish("job.server." + job.token() + ".execution", "stop");
cluster.eventBus().publish("job.server." + job.token() + ".execution", "kill");
logger.debug("Waiting for the job " + job.token() + " to finish.");
try {
globalJobFinishLatch.await(60, TimeUnit.SECONDS);
} catch (InterruptedException e) {
e.printStackTrace();
}
jobExecutionNodes.remove(jobToken);
jobsBeingExecuted.remove(jobToken);
} catch (Exception e){
e.printStackTrace();
} finally {
processLatch.countDown();
}
});
}
}
try {
if(!processLatch.await(1, TimeUnit.MINUTES)){
// TODO Change Note
logger.warn("Failed to shutdown local jobs properly!");
}
jobExecutorService.shutdownNow();
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}
@Override
public String name() {
return getClass().getCanonicalName();
}
}