package automately.core.services.job; import automately.core.data.Job; import automately.core.data.Meta; import automately.core.data.User; import automately.core.data.UserData; import automately.core.data.comparators.JobComparator; import automately.core.data.predicates.JsonQueryPredicate; import automately.core.file.VirtualFile; import automately.core.file.VirtualFileSystem; import automately.core.services.core.AutomatelyService; import automately.core.services.job.execution.factories.hello.HelloWorldContextFactory; import automately.core.services.job.execution.factories.js.NativeJSContextFactory; import automately.core.services.job.execution.factories.v8.V8ContextFactory; import com.hazelcast.core.*; import com.hazelcast.nio.Address; import com.hazelcast.query.*; import io.jsync.Async; import io.jsync.Handler; import io.jsync.app.core.Cluster; import io.jsync.app.core.Config; import io.jsync.app.core.Logger; import io.jsync.buffer.Buffer; import io.jsync.eventbus.EventBus; import io.jsync.eventbus.Message; import io.jsync.http.HttpClient; import io.jsync.impl.Windows; import io.jsync.json.JsonArray; import io.jsync.json.JsonObject; import io.jsync.json.impl.Base64; import java.io.*; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.*; import java.util.concurrent.*; /** * JobServer handles all jobs. This is used to submit jobs to the cluster * for execution. */ public class JobServer extends AutomatelyService { // TODO Definitely complete javadocs private static boolean initialized = false; private static Cluster cluster = null; private static IMap<String, Job> jobs = null; private static ISet<String> jobsBeingExecuted = null; private static IMap<String, String> jobExecutionNodes = null; private static IMap<String, JsonObject> registeredJobServers = null; private static String defaultExecutionFactory = NativeJSContextFactory.class.getCanonicalName(); @Deprecated public static void setScriptContextFactory(String factory) { setDefaultExecutionFactory(factory); } @Deprecated public static String getScriptContextFactory() { return getDefaultExecutionFactory(); } public static void setDefaultExecutionFactory(String factory) { defaultExecutionFactory = factory; } public static String getDefaultExecutionFactory() { return defaultExecutionFactory; } public static boolean initialized(){ return initialized; } private static void checkInitialized(){ if(!initialized){ throw new RuntimeException("The JobServer has not been initialized yet!"); } } /** * isStale is used to check if an Automately Job is stale in the cluster. It will * return false if the job is not stale. 
When a job is stale it means the cluster still reports it as active (running, queued, or processing) but no node is actually executing it anymore. * * @param job the Job you wish to check * @return returns true if the job is stale */ public static boolean isStale(Job job) { checkInitialized(); if (job == null) { throw new NullPointerException(); } ICountDownLatch globalJobFinishLatch = cluster.hazelcast().getCountDownLatch(job.token() + "_job_finish_latch"); String status = job.status; // Return false since the job pretty much has already been handled if (status.equals("complete") || status.equals("halted") || status.equals("stopping") || status.equals("timeout")) { return false; } final boolean[] isStale = {false}; // Newest method for checking for stale jobs // We are checking to see if there is a lock on the job within the cluster // If the job is not being executed then there will be no lock, so if it is locked // the job is not stale ILock executionLock = cluster.hazelcast().getLock("_job_lock_execution_" + job.token()); if(!executionLock.isLocked()){ isStale[0] = true; } // If the job is running, queued, processing we should definitely check if it is stale if (status.equals("running") || status.equals("queued") || status.equals("processing")) { // This means the job is being executed and a node is handling it. // So we should check if the node still exists if(jobsBeingExecuted.contains(job.token()) && jobExecutionNodes.containsKey(job.token())){ String nodeId = jobExecutionNodes.get(job.token()); // The node that was executing it is no longer registered, so the job is stale if(!registeredJobServers.containsKey(nodeId)){ isStale[0] = true; } } else if (!jobsBeingExecuted.contains(job.token())) { isStale[0] = true; } } if(isStale[0]){ job.status = "complete"; JsonObject error = new JsonObject(); error.putString("message", "The job has gone stale. It is no longer being executed."); error.putString("code", "Stale Job"); job.results.putObject("error", error); job.results.putBoolean("success", false); jobs.set(job.token(), job); if (globalJobFinishLatch.trySetCount(1)) { while (globalJobFinishLatch.getCount() > 0){ globalJobFinishLatch.countDown(); } } jobsBeingExecuted.remove(job.token()); jobExecutionNodes.remove(job.token()); } return isStale[0]; } /** * getJob allows you to retrieve a User's Job from the Cluster via * its token. * * @param user the User you wish to retrieve the Job from * @param jobToken the token for the Job you are attempting to find * @return returns the Job if it was found or null if it wasn't */ public static Job getJob(User user, String jobToken) { checkInitialized(); EntryObject e = new PredicateBuilder().getEntryObject(); Predicate predicate = e.get("userToken").equal(user.token()) .and(e.get("token").equal(jobToken)); for (Job job : jobs.values(predicate)) { if (job.token().equals(jobToken)) { return job; } } return null; } /** * getJobs retrieves a Collection<Job> for every Job belonging * to the specified User. * * @param user the User you wish to retrieve the Jobs for * @return returns a Collection<Job> for the User */ public static Collection<Job> getJobs(User user) { checkInitialized(); EntryObject e = new PredicateBuilder().getEntryObject(); Predicate predicate = e.get("userToken").equal(user.token()); return jobs.values(predicate); } /** * getJobs retrieves a Collection<Job> for every Job belonging * to the specified User. This returns 10 results by default.
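* <p>For example, assuming {@code user} is an existing User, {@code JobServer.getJobs(user, 0)} would return the first page of up to 10 jobs.</p>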
* * @param user the User you wish to retrieve the Jobs for * @param page the page index starting from 0 you are looking for * @return returns a Collection<Job> for the User */ public static Collection<Job> getJobs(User user, int page) { return getJobs(user, page, 10); } /** * getJobs retrieves a Collection<Job> for every Job belonging * to the specified User. * * @param user the User you wish to retrieve the Jobs for * @param page the page index starting from 0 you are looking for * @param count the number of results to return * @return returns a Collection<Job> for the User */ public static Collection<Job> getJobs(User user, int page, int count) { checkInitialized(); EntryObject e = new PredicateBuilder().getEntryObject(); com.hazelcast.query.Predicate userJobsPredicate = e.get("userToken").equal(user.token()); if (page < 0) { page = 0; } if (count < 0) { count = 10; } // Max Count is always 100 if (count > 100) count = 100; // This predicate uses the previous one and then sorts the jobs by date // IMPORTANT apparently this can't be a lambda PagingPredicate pagingPredicate = new PagingPredicate(userJobsPredicate, new JobComparator(), count); Collection<Job> values = jobs.values(pagingPredicate); if (page > pagingPredicate.getPage()) { while (page > pagingPredicate.getPage()) { pagingPredicate.nextPage(); } values = jobs.values(pagingPredicate); } return values; } /** * getRunningJobs will return a Collection<Job> containing all * of the "normal" running jobs for the user. * * @param user the User you wish to retrieve the Collection<Job> for * @return the Collection<Job> you are retrieving */ public static Collection<Job> getRunningJobs(User user) { checkInitialized(); EntryObject e = new PredicateBuilder().getEntryObject(); Predicate predicate = e.get("userToken").equal(user.token()) .and(e.get("service").equal(false)) .and(e.get("status").equal("running")); // We only return running non-service jobs return jobs.values(predicate); } /** * getRunningServices will return a Collection<Job> containing all * of the running service jobs for the user. * * @param user the User you wish to retrieve the Collection<Job> for * @return the Collection<Job> you are retrieving */ public static Collection<Job> getRunningServices(User user) { checkInitialized(); EntryObject e = new PredicateBuilder().getEntryObject(); Predicate predicate = e.get("userToken").equal(user.token()) .and(e.get("service").equal(true)) .and(e.get("status").equal("running")); return jobs.values(predicate); } /** * getService is used to retrieve a service Job via its serviceName. * * @param user the User you wish to retrieve the Job for * @param serviceName the serviceName for the Job you wish to retrieve * @return the Job you wish to retrieve or null if it doesn't exist */ public static Job getService(User user, String serviceName) { checkInitialized(); EntryObject e = new PredicateBuilder().getEntryObject(); Predicate predicate = e.get("userToken").equal(user.token()) .and(e.get("service").equal(true)) .and(e.get("status").equal("running")) .and(e.get("serviceName").equal(serviceName)); // We only want to get running services Collection<Job> values = jobs.values(predicate); if (values.iterator().hasNext()) { return values.iterator().next(); } return null; } /** * publishEvent is used to publish events on the internal JobServer such as * halt, error, etc.
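* <p>For example, {@code JobServer.publishEvent(job, "halt")} would publish a halt event on the {@code job.server.<token>.events} address for the given job.</p>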
* * @param job the Job you are publishing the event for * @param event the event you are publishing */ public static void publishEvent(Job job, String event) { checkInitialized(); String jobEventIdentifier = "job.server." + job.token() + ".events"; EventBus eventBus = cluster.eventBus(); eventBus.publish(jobEventIdentifier, event.trim()); } /** * updateStatus allows you to simply update a Job's status, for example from * error to complete. * * @param job the Job you are setting the status for * @param status the status you are setting */ public static void updateStatus(Job job, String status) { checkInitialized(); status = status.trim().toLowerCase(); cluster.logger().info("Updating Job status for the job " + job.token() + ": " + status); job.status = status; job.updated = new Date(); jobs.set(job.token(), job); publishEvent(job, status); } private Logger logger; private Async async; private EventBus eventBus; private ExecutorService jobExecutorService; private String nodeId = ""; private Handler<Message> jobEventBusHandler = null; private long staleJobTimer = -1; private String defaultJobLogPath = "./fs/logs/"; private int maxJvmSize = 512; private int minJvmSize = 16; private IMap<String, String> enabledExecutionFactories; private long queuedJobCleanupTimer = -1; private int maxQueuedJobs = 5; private int minQueuedJobs = 1; private boolean enableQueuedJobs = false; // This will store nodeIds private IMap<String, String> queuedJobMap; private IMap<String, String> tmpQueuedJobMap; private Map<String, Process> queuedJobs; private Process initJobRunnerProcess(Job job, int minJvmSize, int maxJvmSize) throws IOException { return initJobRunnerProcess(job, minJvmSize, maxJvmSize, defaultExecutionFactory, false, 60 * 24); } private Process initJobRunnerProcess(Job job, int minJvmSize, int maxJvmSize, String executionFactory) throws IOException { return initJobRunnerProcess(job, minJvmSize, maxJvmSize, executionFactory, false, 60 * 24); } private Process initJobRunnerProcess(Job job, int minJvmSize, int maxJvmSize, String executionFactory, boolean queued) throws IOException { return initJobRunnerProcess(job, minJvmSize, maxJvmSize, executionFactory, queued, 60 * 24); } private Process initJobRunnerProcess(Job job, int minJvmSize, int maxJvmSize, String executionFactory, boolean queued, long awaitTimeout) throws IOException { // Is this a new status??
job.status = "waiting"; // Each Script must be ran by the // script runner process String javaHome = System.getProperty("java.home"); String javaBin = javaHome + File.separator + "bin" + File.separator + "java"; String classpath = System.getProperty("java.class.path"); String className = JobRunner.class.getCanonicalName(); String clusterHost = "127.0.0.1:5271"; try { Address address = cluster.hazelcast().getCluster().getLocalMember().getAddress(); clusterHost = address.getHost() + ":" + address.getPort(); } catch (Exception ignored) { } JsonArray nodeList = new JsonArray(); nodeList.add(clusterHost); cluster.hazelcast().getCluster().getMembers().forEach(member -> { Address memberAddress = member.getAddress(); String address = memberAddress.getHost() + ":" + memberAddress.getPort(); if(!nodeList.contains(address)){ nodeList.add(address); } }); String nodeListStr = Base64.encodeBytes(nodeList.encode(true).getBytes(), Base64.DONT_BREAK_LINES); String jobToken = job.token(); String[] args; String configPath = cluster().config().getConfigPath(); if (queued) { jobToken = "await_" + jobToken; args = new String[]{ javaBin, "-Xms" + minJvmSize + "m", "-Xmx" + maxJvmSize + "m", "-cp", classpath, className, cluster().manager().nodeId(), nodeListStr, configPath, jobToken, executionFactory, String.valueOf(awaitTimeout) }; } else { args = new String[]{ javaBin, "-Xms" + minJvmSize + "m", "-Xmx" + maxJvmSize + "m", "-cp", classpath, className, cluster().manager().nodeId(), nodeListStr, configPath, jobToken, executionFactory }; } ProcessBuilder builder = new ProcessBuilder(args); Path logFile = Paths.get(defaultJobLogPath + job.token() + ".log"); Path logFolder = Paths.get(defaultJobLogPath); if (!Files.exists(logFolder)) { Files.createDirectories(logFolder); } if (!Files.exists(logFile)) { Files.createFile(logFile); } builder.redirectError(logFile.toAbsolutePath().toFile()); logger.info("Starting process for the job \"" + job.token() + "\"..."); return builder.start(); } private void initQueuedJobs() { initQueuedJobs(minJvmSize, maxJvmSize, 60 * 24, maxQueuedJobs); } private void initQueuedJobs(int count) { initQueuedJobs(minJvmSize, maxJvmSize, 60 * 24, count); } private void initQueuedJobs(int minJvmSize, int count) { initQueuedJobs(minJvmSize, maxJvmSize, 60 * 24, count); } private void initQueuedJobs(int minJvmSize, int maxJvmSize, int count) { initQueuedJobs(minJvmSize, maxJvmSize, 60 * 24, count); } private void initQueuedJobs(int minJvmSize, int maxJvmSize, int awaitTimeout, int count) { if (!enableQueuedJobs) { return; } if (queuedJobs == null) { queuedJobs = new ConcurrentHashMap<>(); // Every 60 Seconds seems like a decent time to attempt to cleanup jobs queuedJobCleanupTimer = async.setPeriodic(1000 * 60, event -> queuedJobs.forEach((s, process) -> { if(!process.isAlive()){ queuedJobs.remove(s); tmpQueuedJobMap.remove(s); queuedJobMap.remove(s); } })); } // Add a job if the queued job size is less than the minimum // Do not add a job if the queued size is greater than the maximum while (minQueuedJobs > queuedJobs.size() || (maxQueuedJobs > queuedJobs.size() && count > 0)) { count--; Job tmpJob = new Job(); try { // We are initializing the default ExecutionContextFactory since these jobs are queued Process process = initJobRunnerProcess(tmpJob, minJvmSize, maxJvmSize, defaultExecutionFactory, true, awaitTimeout); queuedJobs.put(tmpJob.token(), process); // This will allow queued jobs to work // from jobs submitted from another job queuedJobMap.put(tmpJob.token(), this.nodeId); } catch 
(IOException e) { e.printStackTrace(); } } } private void gracefullyStopQueuedJob(String jobToken){ if(tmpQueuedJobMap.containsKey(jobToken)){ // We cannot block the main event loop so we run the timeout // in its own thread new Thread(() -> { Thread.currentThread().setName(jobToken + "_queued_timeout"); HazelcastInstance hz = cluster.hazelcast(); // We need to remove this job and tell the process to stop // since we reached a timeout ICountDownLatch runnerAwaitLatch = hz.getCountDownLatch("_jobrunner_await_" + jobToken); if(runnerAwaitLatch.getCount() == 0){ // This will help ensure the JobRunner is in fact ready, since it is possible it has not finished starting yet ICountDownLatch awaitContinueLatch = hz.getCountDownLatch("_jobrunner_awaitcont_" + jobToken); awaitContinueLatch.trySetCount(1); try { awaitContinueLatch.await(15, TimeUnit.SECONDS); } catch (InterruptedException ignored) { } } while (runnerAwaitLatch.getCount() > 0) { runnerAwaitLatch.countDown(); } }).start(); } } public Job getQueuedJob(){ return getQueuedJob(false, false); } public Job getQueuedJob(boolean random, boolean localOnly) { if (queuedJobMap.size() > 0) { List<String> queuedJobIds; if(localOnly && (cluster().config().isRole("job") || cluster().config().isAll())){ queuedJobIds = new ArrayList<>(queuedJobMap.keySet(Predicates.equal("toString", this.nodeId))); } else { queuedJobIds = new ArrayList<>(queuedJobMap.keySet()); } if(random){ // Let's attempt to get a random job Collections.shuffle(queuedJobIds); } Iterator<String> iterator = queuedJobIds.iterator(); while (iterator.hasNext()) { String jobId = iterator.next(); Job tmpJob = new Job(); tmpJob.loadJson(new JsonObject().putString("token", jobId)); String nodeId = queuedJobMap.remove(tmpJob.token()); if (!registeredJobServers.containsKey(nodeId)) { continue; } // Let's store the serverId tmpQueuedJobMap.put(tmpJob.token(), nodeId); // Let's go ahead and set a timer that will ensure // the retrieved queued job will shut down if it's not used. // This will ensure there aren't any rogue jobs. async.setTimer(60 * 1000, event -> { try { if(!jobs().containsKey(tmpJob.token())){ gracefullyStopQueuedJob(tmpJob.token()); } } catch (Exception ignored){ } }); return tmpJob; } } return null; } @Override public void start(Cluster owner) { cluster = owner; this.logger = cluster.logger(); this.async = cluster.async(); this.eventBus = cluster.eventBus(); Config config = cluster.config(); // We use this so we can queue up jobs that don't get processed due to load jobsBeingExecuted = cluster.data().getSet("jobs.executing"); jobExecutionNodes = cluster.data().getMap("jobs.executing.nodes"); registeredJobServers = cluster.data().getMap("job.server.nodes"); queuedJobMap = cluster.data().getMap("jobs.queued"); tmpQueuedJobMap = cluster.data().getMap("jobs.queued.tmp"); enabledExecutionFactories = cluster.data().getMap("jobs.execution.factories"); jobs = jobs(); initialized = true; IMap<String, Job> registeredServices = cluster.data().persistentMap("job.server.user.services"); // If we are not a job server or our role isn't configured for all // then we do not need to continue.
if ((!config.isRole("job") && !config.isAll()) || cluster().manager().clientMode()) return; JsonObject jobServerConfig = coreConfig().getObject("job", new JsonObject()); if (!jobServerConfig.containsField("max_jobs")) { jobServerConfig.putNumber("max_jobs", 50); } if (!jobServerConfig.containsField("max_queued_jobs")) { jobServerConfig.putNumber("max_queued_jobs", 5); } if (!jobServerConfig.containsField("min_queued_jobs")) { jobServerConfig.putNumber("min_queued_jobs", 2); } if (!jobServerConfig.containsField("enable_queued_jobs")) { jobServerConfig.putBoolean("enable_queued_jobs", true); } // Let's set some default execution factories.. if (!jobServerConfig.containsField("execution_factories")) { JsonArray defaultExecutionFactories = new JsonArray(); // Let's go ahead and add the default execution factory JsonObject defaultFactoryConf = new JsonObject(); defaultFactoryConf.putString("platformId", "default"); defaultFactoryConf.putString("executionFactory", defaultExecutionFactory); // Default is always enabled - maybe this should be changed //defaultFactoryConf.putBoolean("enabled", true); // Let's go ahead and add the default execution factory JsonObject helloWorldFactoryConf = new JsonObject(); helloWorldFactoryConf.putString("platformId", "hello"); helloWorldFactoryConf.putString("executionFactory", HelloWorldContextFactory.class.getCanonicalName()); helloWorldFactoryConf.putBoolean("enabled", true); // Let's go ahead and add the default execution factory JsonObject v8FactoryConf = new JsonObject(); v8FactoryConf.putString("platformId", "v8"); v8FactoryConf.putString("executionFactory", V8ContextFactory.class.getCanonicalName()); v8FactoryConf.putBoolean("enabled", false); defaultExecutionFactories.add(defaultFactoryConf); jobServerConfig.putArray("execution_factories", defaultExecutionFactories); } JsonArray executionFactories = jobServerConfig.getArray("execution_factories", new JsonArray()); for (Object factoryConf : executionFactories) { if(factoryConf instanceof JsonObject){ JsonObject jsonFactoryConf = (JsonObject) factoryConf; String platformId = jsonFactoryConf.getString("platformId", ""); String executionFactory = jsonFactoryConf.getString("executionFactory", ""); // We can skip this since it doesn't look like an execution factory if(platformId.isEmpty() || executionFactory.isEmpty()){ continue; } if(jsonFactoryConf.getBoolean("enabled", false)){ enabledExecutionFactories.put(platformId, executionFactory); } else { // We need to remove it because all nodes // need the same configuration when it comes to this enabledExecutionFactories.remove(platformId); } } } // We need to ensure the default execution factory is always enabled if(!enabledExecutionFactories.containsKey("default")){ enabledExecutionFactories.set("default", defaultExecutionFactory); } coreConfig().putObject("job", jobServerConfig); config.save(); // END configuration settings. 
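// For reference, the "job" section of the core config written above would end up looking
// roughly like the following sketch (values shown are the defaults applied when the fields are missing):
// {
//   "max_jobs": 50,
//   "max_queued_jobs": 5,
//   "min_queued_jobs": 2,
//   "enable_queued_jobs": true,
//   "execution_factories": [
//     { "platformId": "default", "executionFactory": "automately.core.services.job.execution.factories.js.NativeJSContextFactory" }
//   ]
// }
// Each execution_factories entry carries a "platformId", an "executionFactory" class name, and an optional
// "enabled" flag; only enabled entries (plus "default") end up in enabledExecutionFactories.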
// This value is used by _serverConfig String jobServerType = jobServerConfig.getString("server_type", "all"); logger.info("Server type is \"" + jobServerType + "\""); int maxJobs = jobServerConfig.getInteger("max_jobs"); this.nodeId = cluster.manager().nodeId(); // We need to store this information temporarily so we can save it in registeredJobServers jobServerConfig.putBoolean("clientMode", cluster.manager().clientMode()); jobServerConfig.putString("nodeId", this.nodeId); registeredJobServers.set(this.nodeId, jobServerConfig); jobExecutorService = Executors.newFixedThreadPool(maxJobs + 5); // Size the pool at maxJobs plus 5 extra threads as a safety margin // Job handler - this is an event bus handler that actually handles our job execution jobEventBusHandler = (Message event) -> { if (event.body() != null) { if (event.body() instanceof String && jobs().containsKey(event.body().toString())) { jobExecutorService.submit(new Runnable() { @Override public void run() { // Here we will handle the actual processing of the job. Job job = jobs().get(event.body().toString()); updateStatus(job, "processing"); // Create an ICountDownLatch so we can let the cluster know that we are not finished running this job. ICountDownLatch globalJobFinishLatch = cluster.hazelcast().getCountDownLatch(job.token() + "_job_finish_latch"); globalJobFinishLatch.trySetCount(1); // Store the current job token so other nodes can know that this job is being executed with a simple check jobsBeingExecuted.add(job.token()); // Store the current node handling the execution of this job jobExecutionNodes.set(job.token(), cluster().manager().nodeId()); // Begin timeout handling - This ensures jobs are not running forever // By default all jobs are timed out at 15 minutes unless they are a service job long defaultTimeout = TimeUnit.MINUTES.toMillis(15); if (job.service) { defaultTimeout = 0; // Service jobs do not have a timeout } String executionAddr = "job.server." + job.token() + ".execution"; long timeoutTimer = 0; // If the defaultTimeout is set to 0 then we will not cause timeouts.. this could be a very dangerous feature. Use at own risk if (defaultTimeout > 0) { timeoutTimer = async.setTimer(defaultTimeout, aLong -> eventBus.publish(executionAddr, "timeout")); } // if script is null we will pull script data from job.. Job completedJob; // This will help let us know that the job is being handled. ILock executionLock = cluster.hazelcast().getLock("_job_lock_execution_" + job.token()); executionLock.lock(); // We store the process in an array so the execution handler // can access it Process[] process = new Process[1]; if (tmpQueuedJobMap.containsKey(job.token()) && queuedJobs.containsKey(job.token())) { process[0] = queuedJobs.get(job.token()); queuedJobs.remove(job.token()); tmpQueuedJobMap.remove(job.token()); // Let's go ahead and initialize some new queued jobs if (queuedJobs.size() < minQueuedJobs) { // We run this in its own thread so we don't block the async event loop new Thread(() -> { Thread.currentThread().setName("init-queued-jobs"); // Let's try to start more queued jobs in its place initQueuedJobs(5); }).start(); } } Buffer consoleBuffer = new Buffer(); // Let's allow the console buffer to be retrieved.. Handler<Message> jobPrintStreamHandler = message -> { if (!(message.body() instanceof String)) return; String method = (String) message.body(); if (method.equals("retrieve")) { message.reply(consoleBuffer); } }; // We need a simple way to retrieve the console eventBus.registerHandler("job.server."
+ job.token() + ".printStreamBuffer", jobPrintStreamHandler); try { final long finalTimeoutTimer = timeoutTimer; // This is used so we can tell the job script execution to error, stop, timeout, or halt // Begin handler for job execution control eventBus.registerHandler(executionAddr, new Handler<Message>() { @Override public void handle(io.jsync.eventbus.Message message) { if (!(message.body() instanceof String)) return; String method = (String) message.body(); logger.info("Execution event received \"" + method + "\"..."); if (method.equals("halt") || method.equals("stop") || method.equals("timeout") || method.equals("error")) { // Re-read the latest copy of the job so we do not update the status of the job improperly Job latestJob = jobs().get(job.token()); // IMPORTANT - Execution was halted, which means something outside our scope stopped it, // so we update the status accordingly and then shut the runner process down below switch (method) { case "error": // Handle async error // We don't need to do anything break; case "stop": updateStatus(latestJob, "stopping"); break; case "timeout": updateStatus(latestJob, "timeout"); break; } Handler<Message> self = this; new Thread(() -> { try { // This will go ahead and tell the JobRunner to gracefully terminate eventBus.publish(executionAddr, "kill"); // We are going to tell the final shutdown to wait at least // 30 seconds before we forcibly destroy it process[0].waitFor(30, TimeUnit.SECONDS); } catch (InterruptedException ignored) { } finally { process[0].destroyForcibly(); forceKillJob(latestJob); eventBus.unregisterHandler(executionAddr, self); } }).start(); } else if (method.equals("cancel_timeout")) { async.cancelTimer(finalTimeoutTimer); logger.info("Canceling timeout for job " + job.token()); } } }); if (process[0] == null) { logger.info("Starting process for the job " + job.token()); // Default should always be enabled process[0] = initJobRunnerProcess(job, minJvmSize, maxJvmSize, enabledExecutionFactories.get(job.config.getString("platform", "default"))); } else { if (process[0].isAlive()) { logger.info("Continuing process for the job " + job.token()); // Do nothing ICountDownLatch runnerAwaitLatch = cluster.hazelcast().getCountDownLatch("_jobrunner_await_" + job.token()); if(runnerAwaitLatch.getCount() == 0){ // This will help ensure the JobRunner is in fact ready, since it is possible // that it's not ICountDownLatch awaitContinueLatch = cluster.hazelcast().getCountDownLatch("_jobrunner_awaitcont_" + job.token()); awaitContinueLatch.trySetCount(1); // This means that the JobRunner hasn't finished starting. awaitContinueLatch.await(60, TimeUnit.SECONDS); } while (runnerAwaitLatch.getCount() > 0) { runnerAwaitLatch.countDown(); } } else { process[0] = initJobRunnerProcess(job, minJvmSize, maxJvmSize); } } // This is a way of keeping the console output // stored in memory in case we have to kill the process InputStream consoleStream = process[0].getInputStream(); // Let's ensure we read the console stream.
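// The process output is accumulated in consoleBuffer so it can be served by the printStreamBuffer handler registered above and stored in the job results even if the process has to be killed.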
new Thread(() -> { try { InputStreamReader isr = new InputStreamReader(consoleStream); BufferedReader br = new BufferedReader(isr); int c; while ((c = br.read()) != -1) { if (cluster().config().isDebug()) { System.out.print(((char) c)); } consoleBuffer.appendByte((byte) c); } } catch (IOException ignored) { } }).run(); // End handler for job execution control process[0].waitFor(); } catch (IOException | InterruptedException e) { logger.info("The job " + job.token() + " was interrupted."); } finally { completedJob = jobs().get(job.token()); // Let's store the output from the consoleBuffer object. // This makes it easier to have actual console results completedJob.results.putString("output", consoleBuffer.toString()); // Let's tell the job to kill itself eventBus.publish(executionAddr, "kill"); if (process[0] != null) { try { process[0].waitFor(30, TimeUnit.SECONDS); } catch (Exception e) { e.printStackTrace(); } logger.info("Destroying process for the job " + job.token()); process[0].destroyForcibly(); forceKillJob(completedJob); } eventBus.unregisterHandler("job.server." + job.token() + ".printStreamBuffer", jobPrintStreamHandler); } final Job finalJob = completedJob; if (finalJob.results.containsField("error")) { cluster.eventBus().publish("private.job." + job.token() + ".printStream", new Buffer(finalJob.results.getObject("error", new JsonObject()).getString("message", "error") + "\n")); } // Timeout must be canceled.. // Cancel the timeout timer so we don't do weird stuff.. async.cancelTimer(timeoutTimer); // We want to remove all reserved variables // For some reason field names causes it to stay open for (String key : finalJob.config.toMap().keySet()) { if (key.startsWith("_")) { finalJob.config.removeField(key); } } // Ensure it gets stored updateStatus(finalJob, "complete"); jobs().set(finalJob.token(), finalJob); jobsBeingExecuted.remove(finalJob.token()); jobExecutionNodes.remove(finalJob.token()); if (finalJob.config.containsField("callbackUrl")) { async.runOnContext(event -> { try { HttpClient httpClient = async.createHttpClient(); JsonObject formatted = new JsonObject(); formatted.putString("token", finalJob.token()); formatted.putValue("created", finalJob.created); formatted.putValue("updated", finalJob.updated); formatted.putString("status", finalJob.status); JsonObject formattedResults = new JsonObject(); formattedResults.putBoolean("success", finalJob.results.getBoolean("success", false)); if (finalJob.results.containsField("error")) { formattedResults.putObject("error", finalJob.results.getObject("error")); } formatted.putObject("results", formattedResults); String postbackUrl = finalJob.config.getString("callbackUrl"); httpClient.post(postbackUrl, event12 -> logger.debug("callbackUrl Response Received: " + event12 + " (" + postbackUrl + ")")).putHeader("Content-Type", "application/json") .putHeader("User-Agent", "Automately-Job-Callback") .end(formatted.encode()); } catch (Exception ignored) { } }); } while (globalJobFinishLatch.getCount() > 0) { globalJobFinishLatch.countDown(); } // This is a way to tell any waiting handlers that the job is indeed finished eventBus.publish("job.server." + job.token() + ".finished", "finished"); } }); } } }; // We register a handler so we have a place that receives events for jobs cluster.eventBus().registerHandler("job.server." 
+ this.nodeId, jobEventBusHandler); maxJvmSize = jobServerConfig.getInteger("max_jvm_size", 512); minJvmSize = jobServerConfig.getInteger("min_jvm_size", 16); logger.info("Max jobs set to " + maxJobs); maxQueuedJobs = jobServerConfig.getInteger("max_queued_jobs"); minQueuedJobs = jobServerConfig.getInteger("min_queued_jobs"); logger.info("Maximum queued jobs set to " + maxQueuedJobs); logger.info("Minimum queued jobs set to " + minQueuedJobs); enableQueuedJobs = jobServerConfig.getBoolean("enable_queued_jobs", true); if (enableQueuedJobs) { initQueuedJobs(maxQueuedJobs); } // This exists so we can ensure that all the data for the dataBus is pre-loaded for // anything accessing it such as the DataBusObject cluster.data().persistentMap("dataBus"); if (!cluster.hazelcast().getPartitionService().isClusterSafe()) { // Wait for the local member's partitions to be safe before we start anything // If the cluster is big it may take up to 10 minutes for it to be ready cluster.hazelcast().getPartitionService().forceLocalMemberToBeSafe(10, TimeUnit.MINUTES); } // TODO improve startup scripts // Startup Scripts are called right before any other job gets started when the JobServer first starts. // This allows you to have scripts running on the server that can be handling many things JsonArray scriptsToStart = jobServerConfig.getArray("startup_scripts", new JsonArray()); // Begin Startup Scripts for (Object value : scriptsToStart) { if (value instanceof String && value.toString().split(":").length > 1) { String newVal = (String) value; String user = newVal.split(":")[0]; String script = newVal.split(":")[1]; User mUser = UserData.getUserByUsername(user); if (mUser != null) { // TODO replace with UserFileSystem usage if (VirtualFileSystem.containsUserFile(mUser, script)) { VirtualFile file = VirtualFileSystem.getUserFile(mUser, script); JsonObject scriptConfig = new JsonObject(); logger.info("Attempting to start a job for the startup script " + script); scriptConfig.putString("scriptPath", file.pathAlias); scriptConfig.putString("scriptData", VirtualFileSystem.readFileData(file).toString()); Job newJob = new Job(); newJob.config = new JsonObject().putObject("script", scriptConfig); // we make sure service is false because the script will handle itself if it is a service newJob.service = false; newJob.serviceConfig = new JsonObject(); newJob.serviceName = ""; // Make it empty by default newJob.userToken = mUser.token(); try { newJob = submit(newJob); logger.info("Started new startup job " + newJob.token() + " for the script " + script); } catch (Exception e) { logger.error("Failed to start new startup job " + newJob.token() + " for the script " + script); } } else { logger.error("Failed to start \"" + newVal + "\". The file " + script + " does not exist."); } } else { logger.error("Failed to start \"" + newVal + "\". The user " + user + " does not exist."); } } } // End Startup Scripts CountDownLatch waitLatch = new CountDownLatch(1); Timer startupTimer = new Timer(); startupTimer.schedule(new TimerTask() { @Override public void run() { waitLatch.countDown(); } }, 15000); try { waitLatch.await(2, TimeUnit.MINUTES); } catch (InterruptedException e) { logger.warn("Interrupted while waiting for the startup script timer to finish."); } if (jobServerConfig.getBoolean("autostart_services", true)) { // Begin the startup of all registered services.
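// Each persisted service job below is cloned and resubmitted unless a non-stale job with the same userToken and serviceName is already running, queued, or processing.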
for (Job job : registeredServices.values()) { // We can go ahead and clone the job and then submit it Job newJob = new Job(); newJob.config = job.config; newJob.service = false; // We set this to false because services will call initService newJob.serviceConfig = job.serviceConfig; newJob.serviceName = job.serviceName; newJob.userToken = job.userToken; // Ensure that we do not start up a service when there has already been a job started for one. Collection<Job> existingServices = jobs().values(Predicates.and(Predicates.equal("userToken", newJob.userToken), Predicates.equal("serviceName", newJob.serviceName), Predicates.or(Predicates.equal("status", "running"), Predicates.equal("status", "queued"), Predicates.equal("status", "processing") ))); if (!existingServices.isEmpty()) { boolean alreadyRunning = true; for (Job existing : existingServices) { if (isStale(existing)) { alreadyRunning = false; logger.debug("The job " + existing.token() + " went stale."); // This will attempt to kill it just in case cluster.eventBus().publish("job.server." + existing.token() + ".execution", "kill"); // Just to tell other things waiting to finish it cluster.eventBus().publish("job.server." + existing.token() + ".finished", "finished"); // This code is a last resort method of shutting down the job. forceKillJob(existing); } else { // Set this back to true alreadyRunning = true; } } if (alreadyRunning) { logger.error("Failed to start new service job " + newJob.token() + " for the service " + newJob.serviceName + " for the user " + newJob.userToken + " because a service has already been started."); continue; } } try { submit(newJob); logger.debug("Started new service job " + newJob.token() + " for the service " + newJob.serviceName + " for the user " + newJob.userToken); } catch (Exception e) { logger.error("Failed to start new service job " + newJob.token() + " for the service " + newJob.serviceName + " for the user " + newJob.userToken); } } } if (!cluster.manager().clientMode()) { ExecutorService staleExecutor = Executors.newSingleThreadExecutor(); Runnable staleJobHandler = () -> { logger.debug("Processing old jobs."); for (Job job : jobs().values()) { if (isStale(job)) { logger.debug("The job " + job.token() + " went stale."); cluster.eventBus().publish("job.server." + job.token() + ".execution", "kill"); // Just to tell other things waiting to finish it cluster.eventBus().publish("job.server." + job.token() + ".finished", "finished"); // Last resort method to kill the job forceKillJob(job); } else if (isJobExpired(job, 14)) { logger.debug("Removing the job " + job.token() + " because it has expired. (over 14 days old)"); jobs().remove(job.token()); } else if (isJobExpired(job, 5)) { logger.info("Scrubbing the job " + job.token() + " because it is over 5 days old."); try { if (job.results != null && job.results.containsField("output")) { job.results.putString("output", "Output Scrubbed"); } job.config = new JsonObject(); job.updated = new Date(); jobs().set(job.token(), job); } catch (Exception e) { e.printStackTrace(); } } } }; // Run the check on a separate executor so we never block the main event loop staleJobTimer = async.setPeriodic(TimeUnit.MINUTES.toMillis(30), event -> staleExecutor.execute(staleJobHandler)); // Schedule the first check shortly after startup async.setTimer(15000, event -> staleExecutor.execute(staleJobHandler)); } else { logger.warn("Not checking for stale jobs since we are in client mode."); } } /** * This method is used to submit a job to the cluster.
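* <p>Illustrative usage (a sketch; {@code jobServer}, {@code user}, and {@code scriptConfig} are assumed to already exist):</p>
* <pre>{@code
* Job job = new Job();
* job.userToken = user.token();
* job.config = new JsonObject().putObject("script", scriptConfig);
* job.service = false;
* Job submitted = jobServer.submit(job);
* }</pre>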
* * @param job the Job you wish to send to the Cluster * @return returns a new Job after it has been submitted returns null if it failed */ public Job submit(final Job job) { if (job == null) { throw new NullPointerException("Your job cannot be null."); } if (job.userToken == null || job.userToken.isEmpty()) { throw new NullPointerException("Your job's userToken cannot be null or empty."); } if (job.service && job.serviceConfig == null) { throw new IllegalArgumentException("Cannot start a new service job with an empty service config"); } if (registeredJobServers.size() < 1) { throw new RuntimeException("Cannot submit a job when there are no registered job servers."); } // retrieve it from enabledExecutionFactories String platformId = job.config.getString("platform", "default"); // Here we will go ahead and try to retrieve // a queued job to replace the given Job. If one // is found then the token will be updated. if ((!tmpQueuedJobMap.containsKey(job.token()) && !queuedJobMap.containsKey(job.token()) && queuedJobs != null && !queuedJobs.containsKey(job.token())) && platformId.equals("default")){ Job queuedJob = getQueuedJob(); if (queuedJob != null) { // This will ensure that the tmpJob's token will be copied job.loadJson(new JsonObject().putString("token", queuedJob.token())); } } if(!platformId.equals("default") && !enabledExecutionFactories.containsKey(platformId)){ throw new RuntimeException(platformId + " is an invalid or disabled platformId!"); } // We set the job as queued so other people know that // the job is going to be processed in the cluster. job.status = "queued"; // We must store this job inside the cluster // so we can access it across multiple nodes. jobs().set(job.token(), job); if (!jobsBeingExecuted.contains(job.token())) { final ILock handleLock = cluster().hazelcast().getLock("_job_lock_" + job.token()); if (!handleLock.isLocked()) { try { // We must get a lock for at least 5 minutes so we // don't handle the job multiple times in the server if (handleLock.tryLock()) { User jobUser = UserData.getUserByToken(job.userToken); if (jobUser != null) { if (job.service) { Meta maxServiceJobs = UserData.getMeta(jobUser, "max_service_jobs"); if (maxServiceJobs != null) { if (maxServiceJobs.value instanceof Number) { Number max = (Number) maxServiceJobs.value; // Check for jobs owned by the user that are not lite jobs but are service and are running EntryObject e = new PredicateBuilder().getEntryObject(); Predicate p = e.get("userToken").equal(jobUser.token()) .and(e.get("service").equal(true)) .and(e.get("status").equal("running")); if (jobs().values(p).size() > max.intValue()) { JsonObject newResults = new JsonObject(); newResults.putBoolean("success", false); JsonObject error = new JsonObject(); error.putString("code", "Quota Reached"); error.putString("message", "You have reached your maximum amount of service jobs you can run at the same time."); newResults.putObject("error", error); job.status = "quota_reached"; job.results = newResults; // Let's attempt to stop a queued job if it exists. 
gracefullyStopQueuedJob(job.token()); jobs().set(job.token(), job); return job; } } } } else { // Check for the Maximum Concurrent Allowed Jobs Per User Meta maxConcurrentJobs = UserData.getMeta(jobUser, "max_jobs"); if (maxConcurrentJobs != null) { if (maxConcurrentJobs.value instanceof Number) { Number max = (Number) maxConcurrentJobs.value; // Check for jobs owned by the user that are not lite jobs and are not service and are running EntryObject e = new PredicateBuilder().getEntryObject(); Predicate p = e.get("userToken").equal(jobUser.token()) .and(e.get("service").equal(false)) .and(e.get("status").equal("running")); if (jobs().values(p).size() > max.intValue()) { JsonObject newResults = new JsonObject(); newResults.putBoolean("success", false); JsonObject error = new JsonObject(); error.putString("code", "Quota Reached"); error.putString("message", "You have reached your maximum amount of jobs you can run at the same time."); newResults.putObject("error", error); job.status = "quota_reached"; job.results = newResults; // Let's attempt to stop a queued job if it exists. gracefullyStopQueuedJob(job.token()); jobs().set(job.token(), job); return job; } } } } String jobServerToUse = null; JsonObject jobConfig = job.config; if (tmpQueuedJobMap.containsKey(job.token())) { // We retrieve the server from here to utilize queued jobs jobServerToUse = tmpQueuedJobMap.get(job.token()); logger.info("Submitting the queued job to the server \"" + jobServerToUse + "\""); } else { // This code cannot be used if hazelcast is in client mode for jCluster if (coreConfig().getObject("job", new JsonObject()).getBoolean("execute_on_least_jobs", true)) { JsonObject leastMemberConfig = null; Set<String> keys; if (jobConfig.containsField("_serverConfig")) { keys = registeredJobServers.keySet(new JsonQueryPredicate(jobConfig.getObject("_server_config", new JsonObject()))); } else { keys = registeredJobServers.keySet(); } for (String nodeId : keys) { JsonObject memberConfig = registeredJobServers.get(nodeId); if (leastMemberConfig != null) { int memberSize = jobExecutionNodes.values(Predicates.equal("toString", nodeId)).size(); int leastMemberSize = jobExecutionNodes.values(Predicates.equal("toString", leastMemberConfig.getString("nodeId"))).size(); if (memberSize < leastMemberSize) { leastMemberConfig = memberConfig; } } else { leastMemberConfig = memberConfig; } } if (leastMemberConfig != null) { jobServerToUse = leastMemberConfig.getString("nodeId"); } } } if (jobServerToUse == null) { // Let's choose a server from random now since we haven't // detected one we should use Set<String> keys; if (jobConfig.containsField("_serverConfig")) { keys = registeredJobServers.keySet(new JsonQueryPredicate(jobConfig.getObject("_server_config", new JsonObject()))); } else { keys = registeredJobServers.keySet(); } List<String> nList = new ArrayList<>(keys); Collections.shuffle(nList); jobServerToUse = nList.iterator().next(); } // Now let's actually execute this job by publishing it to the cluster. String serverId = "job.server." 
+ jobServerToUse; logger.info("Submitting the job " + job.token() + " to \"" + serverId + "\""); cluster.eventBus().publish(serverId, job.token()); return job; } } } catch (Exception e) { e.printStackTrace(); } finally { handleLock.unlock(); } return job; } } // We return null if the job cannot be submitted for some reason return null; } private void forceKillJob(Job job) { if (!Windows.isWindows()) { try { Runtime.getRuntime().exec("kill -9 `ps -eo pid,args --cols=10000 | awk '/" + job.token() + "/ && $1 != PROCINFO[\"pid\"] { print $1 }'` &> /dev/null"); } catch (Exception ignored) { } } } /** * This is a simple utility to check if a job has expired. * * @param job the Job you wish to check * @param days the number of days after which the job is considered expired * @return returns true if the job has expired */ private boolean isJobExpired(Job job, int days) { if (job == null) { throw new NullPointerException(); } String status = job.status; // This means we are already processing it. if (status.equals("running") || status.equals("queued") || status.equals("processing")) { return false; } long howManyDays = TimeUnit.MILLISECONDS.toDays(((new Date())).getTime() - job.updated.getTime()); return howManyDays >= days; } @Override public void stop() { if (!nodeId.isEmpty() && jobEventBusHandler != null) { logger.info("Shutting down the JobServer for the node " + nodeId); // Unregister the handler that receives job events for this node cluster.eventBus().unregisterHandler("job.server." + nodeId, jobEventBusHandler); registeredJobServers.remove(this.nodeId); // We need to ensure we stop the processes for all of the queued jobs if (queuedJobs != null) { for (Map.Entry<String, Process> queuedJob : queuedJobs.entrySet()) { try { Process process = queuedJob.getValue(); logger.info("Stopping process for the queued job " + queuedJob.getKey() + "..."); queuedJobs.remove(queuedJob.getKey()); queuedJobMap.remove(queuedJob.getKey()); tmpQueuedJobMap.remove(queuedJob.getKey()); process.destroyForcibly(); } catch (Exception ignored) { } } } if(queuedJobCleanupTimer > -1){ async.cancelTimer(queuedJobCleanupTimer); } if (staleJobTimer > -1) { async.cancelTimer(staleJobTimer); } Collection<String> handlingJobs = jobExecutionNodes.keySet(Predicates.equal("toString", nodeId)); logger.debug("There are " + handlingJobs.size() + " jobs being handled by the node " + this.nodeId); // This latch helps us speed up the shutdown CountDownLatch processLatch = new CountDownLatch(handlingJobs.size()); ExecutorService shutdownService = Executors.newCachedThreadPool(); // Let's automatically handle jobs for this node for (String jobToken : handlingJobs) { logger.debug("Attempting to clean up the job " + jobToken); Job job = jobs().get(jobToken); if (job != null) { // We can submit the shutdown to the shutdown service // so we can process more than one job at a time. shutdownService.submit(() -> { try { // We can tell whatever handler to let the job finish. ICountDownLatch globalJobFinishLatch = cluster.hazelcast().getCountDownLatch(job.token() + "_job_finish_latch"); // This will tell the migration handler to start only if there is a jobserver available if(jobExecutionNodes.size() > 0){ cluster.eventBus().publish("job.server." + job.token() + ".execution", "migrate"); } if (job.service) { logger.info("Attempting to migrate the job " + job.token() + "..."); ICountDownLatch serviceReadyLatch = cluster.hazelcast().getCountDownLatch(job.userToken + "_" + job.serviceName + "_service_ready_latch"); try { // Let's go ahead and attempt to set the count giving // the latch some time to wait.
This is safe because if // the service doesn't get migrated it will still be stopped serviceReadyLatch.trySetCount(5); serviceReadyLatch.await(15, TimeUnit.SECONDS); } catch (Exception ignored) { } } // Let's wait up to 5 seconds for the job // to handle its own migration. try { globalJobFinishLatch.await(5, TimeUnit.SECONDS); } catch (InterruptedException ignored) { } // We are going to send a direct hook to the job to tell it to halt cluster.eventBus().publish("job.server." + job.token() + ".execution", "stop"); cluster.eventBus().publish("job.server." + job.token() + ".execution", "kill"); logger.debug("Waiting for the job " + job.token() + " to finish."); try { globalJobFinishLatch.await(60, TimeUnit.SECONDS); } catch (InterruptedException e) { e.printStackTrace(); } jobExecutionNodes.remove(jobToken); jobsBeingExecuted.remove(jobToken); } catch (Exception e){ e.printStackTrace(); } finally { processLatch.countDown(); } }); } } try { if(!processLatch.await(1, TimeUnit.MINUTES)){ // TODO Change Note logger.warn("Failed to shut down local jobs properly!"); } jobExecutorService.shutdownNow(); } catch (InterruptedException e) { e.printStackTrace(); } } } @Override public String name() { return getClass().getCanonicalName(); } }