package io.seqware.pipeline.engines.whitestar;

import com.google.common.base.Joiner;
import com.google.common.collect.Lists;
import com.google.common.util.concurrent.FutureCallback;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.ListenableFuture;
import com.google.common.util.concurrent.ListeningExecutorService;
import com.google.common.util.concurrent.MoreExecutors;
import io.seqware.common.model.WorkflowRunStatus;
import io.seqware.pipeline.SqwKeys;
import io.seqware.pipeline.api.WorkflowEngine;
import net.sourceforge.seqware.common.metadata.Metadata;
import net.sourceforge.seqware.common.metadata.MetadataFactory;
import net.sourceforge.seqware.common.model.WorkflowRun;
import net.sourceforge.seqware.common.module.ReturnValue;
import net.sourceforge.seqware.common.util.Log;
import net.sourceforge.seqware.common.util.configtools.ConfigTools;
import net.sourceforge.seqware.common.util.filetools.FileTools;
import net.sourceforge.seqware.pipeline.workflowV2.AbstractWorkflowDataModel;
import net.sourceforge.seqware.pipeline.workflowV2.engine.oozie.object.OozieJob;
import net.sourceforge.seqware.pipeline.workflowV2.engine.oozie.object.WorkflowApp;
import org.apache.commons.exec.CommandLine;
import org.apache.commons.exec.DefaultExecutor;
import org.apache.commons.exec.ExecuteException;
import org.apache.commons.exec.Executor;
import org.apache.commons.exec.PumpStreamHandler;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.fs.Path;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentSkipListSet;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Executors;

import static net.sourceforge.seqware.common.util.Rethrow.rethrow;

/**
 * This is a synchronous, bare-bones implementation of the WorkflowEngine for prototyping and debugging.
 *
 * This re-uses much of the OozieWorkflowXml generation code in order to generate the required Bash scripts and supporting files.
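 *
 * A minimal usage sketch (the data model {@code model} and the SGE parameter formats shown here are illustrative
 * assumptions, not values prescribed by this class):
 *
 * <pre>{@code
 * WhiteStarWorkflowEngine engine = new WhiteStarWorkflowEngine(model, false, "-pe smp %s", "-l h_vmem=%sM", true, false);
 * engine.prepareWorkflow(model);
 * ReturnValue result = engine.runWorkflow();
 * }</pre>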
 *
 * @author dyuen
 */
public class WhiteStarWorkflowEngine implements WorkflowEngine {

    // for this engine, just use the SWID
    private String jobId;
    private final boolean useSge;
    private final String threadsSgeParamFormat;
    private final String maxMemorySgeParamFormat;
    private File nfsWorkDir;
    private WorkflowApp workflowApp;
    private final boolean parallel;
    private Persistence persistence;

    /**
     * @param objectModel
     *            the workflow data model to run
     * @param useSge
     *            true to submit steps via qsub, false to run them directly with bash
     * @param threadsSgeParamFormat
     *            format string for the SGE threads parameter
     * @param maxMemorySgeParamFormat
     *            format string for the SGE maximum-memory parameter
     * @param createDirectories
     *            true when creating the engine to launch a job
     * @param parallel
     *            true to run independent steps concurrently
     */
    public WhiteStarWorkflowEngine(AbstractWorkflowDataModel objectModel, boolean useSge, String threadsSgeParamFormat,
            String maxMemorySgeParamFormat, boolean createDirectories, boolean parallel) {
        this.useSge = useSge;
        this.threadsSgeParamFormat = threadsSgeParamFormat;
        this.maxMemorySgeParamFormat = maxMemorySgeParamFormat;
        if (createDirectories) {
            this.nfsWorkDir = initNfsWorkDir(objectModel);
        } else {
            this.nfsWorkDir = null;
        }
        this.parallel = parallel;
    }

    private static File initNfsWorkDir(AbstractWorkflowDataModel model) {
        try {
            File nfsWorkDir = FileTools.createDirectoryWithUniqueName(new File(model.getEnv().getOOZIE_WORK_DIR()), "oozie");
            boolean setWritable = nfsWorkDir.setWritable(true, false);
            if (!setWritable) {
                throw new RuntimeException("Unable to write to working directory");
            }
            System.out.println("Using working directory: " + nfsWorkDir.getAbsolutePath());
            return nfsWorkDir;
        } catch (IOException e) {
            throw rethrow(e);
        }
    }

    private static String seqwareJarPath(AbstractWorkflowDataModel objectModel) {
        return objectModel.getWorkflowBaseDir() + "/lib/seqware-distribution-" + objectModel.getTags().get("seqware_version")
                + "-full.jar";
    }

    @Override
    public void prepareWorkflow(AbstractWorkflowDataModel objectModel) {
        prepareWorkflow(objectModel, null);
    }

    /**
     * @param objectModel
     *            the workflow data model to prepare
     * @param nfsWorkDir
     *            pass an existing working directory to skip creation of the generated scripts
     */
    public void prepareWorkflow(AbstractWorkflowDataModel objectModel, File nfsWorkDir) {
        // parse the object model
        if (nfsWorkDir == null) {
            this.populateNfsWorkDir();
        } else {
            this.nfsWorkDir = nfsWorkDir;
        }
        // regardless of the engine's actual settings, tell the workflow app that we're using SGE so that all scripts are generated
        this.workflowApp = new WorkflowApp(objectModel, this.nfsWorkDir.getAbsolutePath(), new Path("dummy-value"), true, new File(
                seqwareJarPath(objectModel)), this.threadsSgeParamFormat, this.maxMemorySgeParamFormat);
        // go ahead and create the required script files
        if (nfsWorkDir == null) {
            this.workflowApp.serializeXML();
        }
        this.jobId = objectModel.getWorkflow_run_accession();
        this.persistence = new Persistence(this.nfsWorkDir);
    }

    @Override
    public ReturnValue runWorkflow() {
        return runWorkflow(new ConcurrentSkipListSet<String>());
    }

    /**
     * Run a workflow; steps contained within the set are skipped.
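     *
     * A sketch of resuming a partially completed run (assuming {@code engine} was constructed as in the class-level
     * example; the step name below is a hypothetical placeholder for an OozieJob long name):
     *
     * <pre>{@code
     * SortedSet<String> alreadyDone = new ConcurrentSkipListSet<>();
     * alreadyDone.add("step_0"); // hypothetical long name of a completed step
     * engine.runWorkflow(alreadyDone);
     * }</pre>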
     *
     * @param set
     *            a sorted set of the long names of workflow steps that have already completed
     * @return a ReturnValue reporting SUCCESS when every step completes, FAILURE otherwise
     */
    public ReturnValue runWorkflow(SortedSet<String> set) {
        ReturnValue ret = new ReturnValue(ReturnValue.SUCCESS);
        // run this workflow synchronously
        List<List<OozieJob>> jobs = this.workflowApp.getOrderedJobs();
        SortedSet<String> completedJobs = Collections.synchronizedSortedSet(set);
        int swid = Integer.parseInt(this.jobId);
        persistence.persistState(swid, completedJobs);
        for (int j = 0; j < jobs.size(); j++) {
            List<OozieJob> rowOfJobs = jobs.get(j);
            // determine the number of possible retry loops
            int retryLoops = Integer.parseInt(ConfigTools.getSettingsValue(SqwKeys.OOZIE_RETRY_MAX));
            int totalAttempts = retryLoops + 1;
            SortedSet<OozieJob> jobsLeft = Collections.synchronizedSortedSet(new TreeSet<>(rowOfJobs));
            Set<OozieJob> jobsToRemove = new TreeSet<>();
            // filter out completed jobs from a previous run
            for (OozieJob job : jobsLeft) {
                if (completedJobs.contains(job.getLongName())) {
                    jobsToRemove.add(job);
                }
            }
            if (!jobsToRemove.isEmpty()) {
                Log.stdoutWithTime("Skipping " + Joiner.on(",").join(jobsToRemove) + " found in persistent set of completed steps");
                jobsLeft.removeAll(jobsToRemove);
            }
            final SortedSet<OozieJob> jobsFailed = Collections.synchronizedSortedSet(new ConcurrentSkipListSet<OozieJob>());
            for (int i = 1; i <= totalAttempts && !jobsLeft.isEmpty(); i++) {
                Log.stdoutWithTime("Row #" + j + ", Attempt #" + i + " out of " + totalAttempts + " : "
                        + StringUtils.join(jobsLeft, ","));
                // for each row of jobs in the DAG
                ListeningExecutorService pool = null;
                try {
                    if (this.parallel) {
                        pool = MoreExecutors.listeningDecorator(Executors.newFixedThreadPool(jobsLeft.size()));
                    } else {
                        pool = MoreExecutors.listeningDecorator(Executors.newSingleThreadExecutor());
                    }
                    // keep track of memory for submitted jobs and ensure it doesn't reach our limits
                    int memoryLimit = Integer.parseInt(ConfigTools.getSettingsValue(SqwKeys.WHITESTAR_MEMORY_LIMIT));
                    if (!validateJobMemoryLimits(jobsLeft, memoryLimit, swid)) {
                        alterWorkflowRunStatus(swid, WorkflowRunStatus.failed);
                        return new ReturnValue(ReturnValue.FAILURE);
                    }
                    ListenableFuture<List<Integer>> batch;
                    while (!jobsLeft.isEmpty()) {
                        batch = scheduleMemoryLimitedBatch(jobsLeft, pool, jobsFailed, completedJobs);
                        try {
                            batch.get();
                        } catch (InterruptedException | ExecutionException ex) {
                            Log.stdoutWithTime("\tBatch of jobs failed: " + Joiner.on(",").join(jobsFailed));
                            break;
                        }
                    }
                } finally {
                    if (pool != null) {
                        pool.shutdown();
                    }
                }
                jobsLeft.addAll(jobsFailed);
                jobsFailed.clear();
            }
            if (!jobsLeft.isEmpty()) {
                alterWorkflowRunStatus(swid, WorkflowRunStatus.failed);
                return new ReturnValue(ReturnValue.FAILURE);
            }
        }
        alterWorkflowRunStatus(swid, WorkflowRunStatus.completed);
        return ret;
    }

    /**
     * Schedule a batch of jobs, bounded by the memory limit.
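     *
     * An illustrative example (the numbers are assumed, not taken from a real workflow): with WHITESTAR_MEMORY_LIMIT
     * set to 8192 and pending jobs declaring 4096M, 4096M, and 4096M, the first two jobs form one 8192M batch and the
     * third waits for a subsequent call.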
     *
     * @param jobsLeft
     *            a set of jobs to be scheduled; scheduled jobs will be removed
     * @param pool
     *            an execution service to schedule jobs on
     * @param jobsFailed
     *            will contain the jobs that failed
     * @param completedJobs
     *            set of completed jobs
     * @return a future that completes when every job in the batch has finished
     */
    private ListenableFuture<List<Integer>> scheduleMemoryLimitedBatch(final Set<OozieJob> jobsLeft,
            final ListeningExecutorService pool, final Set<OozieJob> jobsFailed, final SortedSet<String> completedJobs) {
        int memoryLimit = Integer.parseInt(ConfigTools.getSettingsValue(SqwKeys.WHITESTAR_MEMORY_LIMIT));
        int memoryUsed = 0;
        List<OozieJob> currentBatch = Lists.newArrayList();
        for (OozieJob job : jobsLeft) {
            int memoryAttempt = Integer.parseInt(job.getJobObject().getMaxMemory());
            if (memoryUsed + memoryAttempt <= memoryLimit) {
                // add job to batch
                currentBatch.add(job);
                memoryUsed += memoryAttempt;
            }
        }
        Log.stdoutWithTime("\tSubmitting " + memoryUsed + "M batch with: " + Joiner.on(",").join(currentBatch));
        List<ListenableFuture<Integer>> memoryBatchFutures = Lists.newArrayList();
        for (final OozieJob job : currentBatch) {
            ListenableFuture<Integer> future = pool.submit(new ExecutionThread(job));
            Futures.addCallback(future, new FutureCallback<Integer>() {
                @Override
                public void onSuccess(Integer result) {
                    if (result != null && result == 0) {
                        Log.stdoutWithTime("\tWorkflow step succeeded: " + job.getLongName());
                        completedJobs.add(job.getLongName());
                        persistence.persistState(Integer.parseInt(WhiteStarWorkflowEngine.this.jobId), completedJobs);
                    } else {
                        jobsFailed.add(job);
                        Log.stdoutWithTime("\tWorkflow step failed: " + job.getLongName());
                    }
                }

                @Override
                public void onFailure(Throwable t) {
                    Log.stdoutWithTime("\tWorkflow step " + job.getLongName() + " was interrupted or threw an exception");
                    jobsFailed.add(job);
                }
            });
            memoryBatchFutures.add(future);
        }
        jobsLeft.removeAll(currentBatch);
        return Futures.allAsList(memoryBatchFutures);
    }

    /**
     * @param jobsLeft
     *            the jobs that remain to be run
     * @param memoryLimit
     *            the per-batch memory limit in megabytes
     * @param swid
     *            the SWID of the workflow run
     * @return true iff all jobs are under the memory limit
     * @throws NumberFormatException
     *             if a job's maximum memory cannot be parsed
     */
    private boolean validateJobMemoryLimits(Set<OozieJob> jobsLeft, int memoryLimit, int swid) {
        // validate that all jobs are under the memory limit
        for (OozieJob job : jobsLeft) {
            int memoryAttempt = Integer.parseInt(job.getJobObject().getMaxMemory());
            if (memoryAttempt > memoryLimit) {
                Log.stdoutWithTime("Workflow step " + job.getLongName() + " exceeds the memory limit of " + memoryLimit);
                alterWorkflowRunStatus(swid, WorkflowRunStatus.failed);
                return false;
            }
        }
        return true;
    }

    private void alterWorkflowRunStatus(int jobId, WorkflowRunStatus status) {
        Log.stdoutWithTime("Setting workflow-run status to " + status + " for: " + jobId);
        // update the workflow run's status in the metadata store
        Metadata ws = MetadataFactory.get(ConfigTools.getSettings());
        WorkflowRun workflowRun = ws.getWorkflowRun(jobId);
        workflowRun.setStatus(status);
        ws.updateWorkflowRun(workflowRun);
    }

    private final class ExecutionThread implements Callable<Integer> {
        private final OozieJob job;

        protected ExecutionThread(OozieJob job) {
            this.job = job;
        }

        @Override
        public Integer call() throws Exception {
            CommandLine cmdLine;
            File scriptsDir = job.getScriptsDir();
            String optionsFileName = OozieJob.optsFileName(job.getLongName());
            String runnerFileName = OozieJob.runnerFileName(job.getLongName());
            if (!WhiteStarWorkflowEngine.this.useSge) {
                cmdLine = new CommandLine("bash");
            } else {
                cmdLine = new CommandLine("qsub");
                cmdLine.addArgument("-sync");
                cmdLine.addArgument("yes");
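                // the full submission built below resembles: qsub -sync yes -@ <scriptsDir>/<optsFile> <scriptsDir>/<runnerFile>
                // ("-sync yes" blocks qsub until the SGE job finishes, keeping this engine synchronous;
                // "-@" reads further qsub options from the generated options file)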
                cmdLine.addArgument("-@");
                cmdLine.addArgument(scriptsDir.getAbsolutePath() + "/" + optionsFileName);
            }
            cmdLine.addArgument(scriptsDir.getAbsolutePath() + "/" + runnerFileName);
            Executor executor = new DefaultExecutor();
            executor.setWorkingDirectory(scriptsDir);
            Log.stdoutWithTime("\tRunning command: " + cmdLine.toString());
            // record output ourselves if not using sge
            if (!WhiteStarWorkflowEngine.this.useSge) {
                // we can only use the last 9 characters to fit into an int
                String time = String.valueOf(System.currentTimeMillis()).substring(4);
                ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
                ByteArrayOutputStream errorStream = new ByteArrayOutputStream();
                PumpStreamHandler streamHandler = new PumpStreamHandler(outputStream, errorStream);
                executor.setStreamHandler(streamHandler);
                // execute!
                try {
                    executor.execute(cmdLine);
                    // grab stdout and stderr
                } catch (ExecuteException e) {
                    Log.debug("\tFatal error in workflow at step: " + job.getLongName());
                    return -1;
                } catch (IOException e) {
                    throw rethrow(e);
                } finally {
                    // mirror SGE's naming convention: stderr goes to the .e file, stdout to the .o file
                    FileUtils.write(new File(scriptsDir.getAbsolutePath() + "/" + job.getLongName() + ".e" + time),
                            errorStream.toString(StandardCharsets.UTF_8.name()), StandardCharsets.UTF_8);
                    FileUtils.write(new File(scriptsDir.getAbsolutePath() + "/" + job.getLongName() + ".o" + time),
                            outputStream.toString(StandardCharsets.UTF_8.name()), StandardCharsets.UTF_8);
                }
            } else {
                try {
                    executor.execute(cmdLine);
                } catch (ExecuteException ex) {
                    Log.debug("Fatal error in workflow at step: " + job.getLongName());
                    return -1;
                }
            }
            return 0;
        }
    }

    @Override
    public ReturnValue watchWorkflow(String jobToken) {
        Metadata ws = MetadataFactory.get(ConfigTools.getSettings());
        WorkflowRun workflowRun = ws.getWorkflowRun(Integer.parseInt(jobToken));
        Log.stdout("Workflow run " + jobToken + " is currently " + workflowRun.getStatus().name());
        return new ReturnValue(workflowRun.getStatus() == WorkflowRunStatus.completed ? ReturnValue.SUCCESS : ReturnValue.FAILURE);
    }

    /**
     * Create the lib directory inside the working directory.
     */
    private void populateNfsWorkDir() {
        File lib = new File(this.nfsWorkDir, "lib");
        boolean mkdir = lib.mkdir();
        if (!mkdir) {
            throw new RuntimeException("Unable to make directory in working dir");
        }
    }

    @Override
    public String getWorkingDirectory() {
        return nfsWorkDir == null ? null : nfsWorkDir.getAbsolutePath();
    }

    @Override
    public String getLookupToken() {
        return this.jobId;
    }
}