package automately.core.services.job;
import automately.core.data.Job;
import automately.core.data.User;
import automately.core.file.VirtualFileService;
import automately.core.services.container.ContainerService;
import automately.core.services.core.AutomatelyService;
import automately.core.services.http.ClusteredHttpServer;
import automately.core.services.job.execution.ExecutionContext;
import automately.core.services.job.execution.ExecutionContextFactory;
import automately.core.services.job.execution.factories.js.NativeJSContextFactory;
import com.hazelcast.client.config.ClientConfig;
import com.hazelcast.client.config.ClientNetworkConfig;
import com.hazelcast.config.NetworkConfig;
import com.hazelcast.core.ICountDownLatch;
import com.hazelcast.core.LifecycleEvent;
import com.hazelcast.core.LifecycleListener;
import io.jsync.Handler;
import io.jsync.app.ClusterApp;
import io.jsync.app.core.Cluster;
import io.jsync.app.core.Config;
import io.jsync.app.core.Logger;
import io.jsync.buffer.Buffer;
import io.jsync.eventbus.Message;
import io.jsync.json.DecodeException;
import io.jsync.json.JsonArray;
import io.jsync.json.JsonObject;
import io.jsync.json.impl.Base64;
import java.lang.reflect.Constructor;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import static automately.core.services.job.JobServer.updateStatus;
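/**
 * Standalone runner process that joins the cluster as a Hazelcast client,
 * loads an {@link ExecutionContextFactory}, executes a single {@link Job},
 * persists its results, and then leaves the cluster.
 *
 * Rough, illustrative launch (argument values are placeholders):
 * java automately.core.services.job.JobRunner nodeId base64NodeList /path/to/config.json jobToken
 */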
public class JobRunner extends ClusterApp {
private static Job currentJob;
private static Runnable initHandler = null;
private static JsonArray nodeList = null;
public static void main(String[] args) {
try {
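// Expected arguments:
//   args[0] - id of the node that spawned this runner
//   args[1] - Base64-encoded JSON array of cluster addresses to connect to
//   args[2] - path to the cluster config file
//   args[3] - job token, or "await_<token>" to wait for a queued job
//   args[4] - (optional) ExecutionContextFactory class name
//   args[5] - (optional) queued-job await timeout in minutes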
// We need to ensure that this runner shuts down whenever the node
// that spawned it shuts down. TODO figure out exactly how that should work.
String nodeId = args[0]; // TODO utilize the nodeId...
nodeList = new JsonArray();
try {
nodeList = new JsonArray(new Buffer(Base64.decode(args[1])).toString());
} catch (DecodeException e) {
// Fall back to an empty node list if the argument cannot be decoded.
}
String configPath = args[2];
System.setProperty("execution.jobId", args[3]);
if (args.length > 4) {
System.setProperty("execution.factory", args[4]);
} else {
System.setProperty("execution.factory", NativeJSContextFactory.class.getCanonicalName());
}
if (args.length > 5) {
System.setProperty("execution.queue.timeout", args[5]);
}
ClusterApp.initialize(new JobRunner(),
"--join", "--cluster-role", "jobrunner",
"--hazelcast-mode", "client", "--disable-saving", "--config", configPath);
if (initHandler != null) {
initHandler.run();
}
} catch (Exception e) {
e.printStackTrace();
System.exit(1);
}
}
public static Job currentJob() {
return currentJob;
}
@Override
protected void prepareConfig(Config config) {
// Keep the async pool small since this runner only executes a single job.
config.rawConfig().putObject("cluster", config.rawConfig().getObject("cluster", new JsonObject()).putNumber("async_pool_size", 4));
}
@Override
protected void prepareCluster(Cluster cluster) {
try {
cluster.manager().addClientConfigHandler(config -> {
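// Point the Hazelcast client at the cluster member addresses
// that were passed to this process on the command line.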
ClientNetworkConfig networkConfig = config.getNetworkConfig();
for (Object node : nodeList) {
if(node instanceof String){
networkConfig.addAddress((String) node);
}
}
config.setNetworkConfig(networkConfig);
});
cluster.addService(new VirtualFileService());
cluster.addService(new ClusteredHttpServer());
cluster.addService(new ContainerService());
cluster.addService(new JobServer());
// Let's add a service that handles the script execution
cluster.addService(new AutomatelyService() {
private String executionAddress = "";
/**
 * Persists the current job back to the cluster so its state and results are stored.
 */
private void updateJob() {
if (currentJob != null) {
cluster().data().persistentMap("jobs").set(currentJob.token(), currentJob);
}
}
@Override
public String name() {
return "JobRunnerService";
}
@Override
public void start(Cluster cluster) {
// Run the job on its own thread so we don't block start().
new Thread(new Runnable() {
@Override
public void run() {
Thread.currentThread().setName("jobrunner-main-thread");
Logger logger = cluster.logger();
try {
logger.info("Running the JobRunner service...");
ClassLoader classLoader = getClass().getClassLoader();
String contextFactory = System.getProperty("execution.factory", NativeJSContextFactory.class.getCanonicalName());
logger.debug("Attempting to load the context factory from " + contextFactory);
Class<?> clazz = classLoader.loadClass(contextFactory);
// Verify the class implements ExecutionContextFactory before instantiating it,
// otherwise the cast below would fail with a ClassCastException instead of
// reaching the error message further down.
if (ExecutionContextFactory.class.isAssignableFrom(clazz)) {
Constructor<?> constructor = clazz.getConstructor();
ExecutionContextFactory factory = (ExecutionContextFactory) constructor.newInstance();
factory.initialize(cluster);
logger.debug("Loaded the context factory from " + contextFactory);
String executionJobId = System.getProperty("execution.jobId");
// The "await_" prefix means this runner should wait
// for a queued job to be released to it.
if (executionJobId.startsWith("await_")) {
String queuedJobId = executionJobId.replaceFirst("await_", "");
ICountDownLatch runnerAwaitLatch = cluster.hazelcast().getCountDownLatch("_jobrunner_await_" + queuedJobId);
runnerAwaitLatch.trySetCount(1);
ICountDownLatch runnerContinueLatch = cluster.hazelcast().getCountDownLatch("_jobrunner_awaitcont_" + queuedJobId);
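// Two latches coordinate queued jobs: the await latch blocks this runner
// until the queued job is released, and the continue latch signals the
// code that queued the job that this runner is up and waiting.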
// Allow any waiting code to continue
while (runnerContinueLatch.getCount() > 0) {
runnerContinueLatch.countDown();
}
logger.info("Putting JobRunner into queued mode.");
try {
logger.debug("Waiting for queued job " + queuedJobId + " CountDownLatch to reach 0...");
int queueTimeout = Integer.parseInt(System.getProperty("execution.queue.timeout", "60"));
logger.info("Queued job " + queuedJobId + " await timeout set to " + queueTimeout + " minute(s).");
boolean waitResult = runnerAwaitLatch.await(queueTimeout, TimeUnit.MINUTES);
while (!waitResult && runnerAwaitLatch.getCount() > 0) {
logger.debug("Timeout reached for queued job " + queuedJobId + ". Waiting another " + queueTimeout + " minute(s)...");
waitResult = runnerAwaitLatch.await(queueTimeout, TimeUnit.MINUTES);
}
logger.debug("Finished waiting for the CountDownLatch to reach 0. JobId is " + queuedJobId);
System.setProperty("execution.jobId", queuedJobId);
} catch (Exception ignored) {
logger.warn("The queued job caught an exception while waiting to start.");
initHandler = () -> {
try {
logger.warn("Shutting down because await exception occurred.");
cluster().leave();
} catch (Exception e) {
e.printStackTrace();
}
System.exit(1);
};
return;
}
}
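// Re-read the job token; if this runner was in queued mode, the property
// was just replaced with the real job token above.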
String jobToken = System.getProperty("execution.jobId");
currentJob = jobs().get(jobToken);
if (currentJob == null) {
logger.error("The job " + jobToken + " was not found.");
initHandler = () -> {
try {
logger.warn("Shutting down because the job " + jobToken + " does not exist.");
cluster().leave();
} catch (Exception e) {
e.printStackTrace();
}
System.exit(1);
};
return;
}
// Setting the ExecutionSecurityPolicy would restrict what executing
// scripts have access to. It is currently disabled.
/*Policy.setPolicy(new ExecutionSecurityPolicy());
System.setSecurityManager(new SecurityManager());*/
User user = users().get(currentJob.userToken);
updateStatus(currentJob, "running");
executionAddress = "job.server." + currentJob.token() + ".execution";
CountDownLatch jobAwaitLatch = new CountDownLatch(1);
Thread[] executionThread = new Thread[1];
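// jobAwaitLatch is released once the execution thread finishes (or when the
// kill handler forces it down); the single-element array lets the anonymous
// kill handler below reference the execution thread.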
// The killHandler is an attempt to gracefully stop
// script execution and shut down the cluster.
Handler<Message> killHandler = new Handler<Message>() {
@Override
public void handle(Message event) {
if (!(event.body() instanceof String)) return;
String method = (String) event.body();
if (method.equals("kill") || method.equals("error") || method.equals("halt")) {
logger.info("Execution event received \"" + method + "\"...");
// We must unregister the handler first so it does not continue running
cluster().eventBus().unregisterHandler(executionAddress, this);
logger.info("Killing the job \"" + currentJob.token() + "\"...");
try {
if(executionThread[0] != null){
// TODO verify this is working..
// Trigger the "shutdown" handler; the executing script
// is expected to implement it properly.
cluster().eventBus().publish("job.server." + currentJob.token() + ".execution", "shutdown");
cluster().async().setTimer(4500, event12 -> {
// executionThread[0].interrupt(); This may not be needed
while (jobAwaitLatch.getCount() > 0){
jobAwaitLatch.countDown();
}
});
} else {
logger.warn("Forcing shutdown...");
try {
cluster.leave();
} catch (Exception ignored) {
// There can be errors while attempting to leave the cluster
}
System.exit(1);
}
} catch (Exception ignored) {
// We can ignore this
}
}
}
};
// Register the kill handler so the job can be stopped gracefully
// from elsewhere in the cluster.
cluster().eventBus().registerHandler(executionAddress, killHandler);
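// Expose the job token to the execution context through the job config.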
currentJob.config.putValue("_jobToken", currentJob.token());
JsonObject mergedResults = new JsonObject();
mergedResults.putBoolean("success", true);
executionThread[0] = new Thread(() -> {
try {
ExecutionContext context = factory.create(user, currentJob.config);
boolean broadcastPrintStream = currentJob.config.getBoolean("_broadcastPrintStream", false);
// Forward console output over the event bus when requested,
// since listeners may be elsewhere in the cluster.
context.setConsoleHandler(event -> {
if (broadcastPrintStream) {
cluster.eventBus().publish("private.job." + currentJob.token() + ".printStream", new Buffer().appendBytes(event.getBytes()));
}
}).setExceptionHandler(t -> {
t.printStackTrace();
logger.info("Exception handler triggered.");
JsonObject error = new JsonObject();
error.putString("code", "Error");
error.putString("message", t.toString());
mergedResults.putObject("error", error);
mergedResults.putBoolean("success", false);
mergedResults.mergeIn(currentJob.results);
// This is safe since the error didn't happen on the execution thread
currentJob.results = mergedResults;
updateJob();
}).setUncaughtExceptionHandler(e -> {
e.printStackTrace();
// This is more for errors that are not thrown
// in the main execution thread.
logger.info("Uncaught Exception handler triggered.");
JsonObject error = new JsonObject();
error.putString("code", "Error");
error.putString("message", e.toString());
mergedResults.putObject("error", error);
mergedResults.putBoolean("success", false);
mergedResults.mergeIn(currentJob.results);
// This is safe since the error didn't happen on the execution thread
currentJob.results = mergedResults;
updateJob();
// Tell the job to stop since there was an uncaught error
// (the publish happens asynchronously over the event bus).
cluster().eventBus().publish(executionAddress, "error");
}).execute();
logger.info("Execution has finished...");
} catch (Exception e) {
// Store any errors that are thrown
// while creating or executing the ExecutionContext
JsonObject error = new JsonObject();
error.putString("code", "Error");
error.putString("message", e.toString());
mergedResults.putObject("error", error);
mergedResults.putBoolean("success", false);
} finally {
// Always release the latch once execution has finished, whether it succeeded or failed.
jobAwaitLatch.countDown();
}
});
executionThread[0].setName("execution-thread");
long startTime = System.nanoTime();
executionThread[0].run();
// Let's go ahead and wait for the
// execution thread to finish
try {
jobAwaitLatch.await();
} catch (InterruptedException ignored){
}
// Strip any internal fields (those prefixed with "_") from the results.
// Copy the field names into an array first so we don't modify the
// underlying collection while iterating over it.
for (String field : currentJob.results.getFieldNames().toArray(new String[0])) {
if (field.startsWith("_")) {
currentJob.results.removeField(field);
}
}
long totalTime = TimeUnit.NANOSECONDS.toSeconds(System.nanoTime() - startTime);
mergedResults.putNumber("timeToComplete", totalTime);
mergedResults.mergeIn(currentJob.results);
currentJob.results = mergedResults;
updateJob();
logger.info("Finished updating the job...");
logger.info("Finishing job...");
// Let's go ahead and attempt to leave the cluster.
cluster().leave();
logger.info("The job has completed...");
System.exit(0);
} else {
cluster().logger().error("\"" + System.getProperty("execution.factory",
NativeJSContextFactory.class.getCanonicalName()) + "\" is not a valid ExecutionContextFactory.");
try {
cluster.leave();
} catch (Exception ignored) {
}
System.exit(1);
}
} catch (Exception e) {
cluster().logger().error(e.getMessage());
try {
cluster().leave();
} catch (Exception ignored) {
}
System.exit(1);
}
}
}).start();
}
@Override
public void stop() {
try {
if (!cluster().hazelcast().getLifecycleService().isRunning()){
// We must kill the process if hazelcast is already dead
cluster().logger().error("Forcefully shutting down the process since hazelcast has died.");
System.exit(1);
}
} catch (Exception ignored) {
System.exit(1);
}
}
});
} catch (Exception e) {
throw new RuntimeException("There was an issue while attempting to prepare the cluster.", e);
}
}
}