/* * Copyright (C) 2011 SeqWare * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package net.sourceforge.seqware.pipeline.plugins; import io.seqware.Engines; import io.seqware.common.model.WorkflowRunStatus; import io.seqware.oozie.action.sge.JobStatus; import io.seqware.pipeline.SqwKeys; import java.io.File; import java.io.IOException; import java.io.StringBufferInputStream; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.text.MessageFormat; import java.util.ArrayList; import java.util.Arrays; import java.util.Date; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Properties; import java.util.Set; import java.util.SortedMap; import java.util.TreeMap; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; import java.util.regex.Matcher; import java.util.regex.Pattern; import net.sourceforge.seqware.common.metadata.Metadata; import net.sourceforge.seqware.common.model.WorkflowRun; import net.sourceforge.seqware.common.module.ReturnValue; import net.sourceforge.seqware.common.module.ReturnValue.ExitStatus; import net.sourceforge.seqware.common.util.Log; import net.sourceforge.seqware.common.util.filetools.FileTools; import net.sourceforge.seqware.common.util.filetools.FileTools.LocalhostPair; import net.sourceforge.seqware.pipeline.plugin.Plugin; import net.sourceforge.seqware.pipeline.plugin.PluginInterface; import net.sourceforge.seqware.pipeline.tools.RunLock; import net.sourceforge.seqware.pipeline.workflowV2.engine.oozie.object.OozieJob; import org.apache.commons.io.FileUtils; import org.apache.hadoop.conf.Configuration; import org.apache.oozie.client.OozieClient; import org.apache.oozie.client.WorkflowAction; import org.apache.oozie.client.WorkflowJob; import org.apache.xerces.util.XMLChar; import org.openide.util.lookup.ServiceProvider; /** * This plugin lets you monitor the status of running workflows and updates the metadata object with their status. * * @author boconnor * @version $Id: $Id */ @ServiceProvider(service = PluginInterface.class) public class WorkflowStatusChecker extends Plugin { public static final String WORKFLOW_RUN_ACCESSION = "workflow-run-accession"; private static final String METADATA_SYNC = "synch_for_metadata"; // variables for use in the app private String hostname = null; private String username = null; /** * <p> * Constructor for WorkflowStatusChecker. * </p> */ public WorkflowStatusChecker() { super(); parser.acceptsAll(Arrays.asList(WORKFLOW_RUN_ACCESSION, "wra"), "Optional: this will cause the program to only check the status of workflow run(s). For multiple runs, comma-separate with no spaces") .withRequiredArg().withValuesSeparatedBy(',').ofType(Integer.class); parser.acceptsAll(Arrays.asList("workflow-accession", "wa"), "Optional: this will cause the program to only check the status of workflow runs that are this type of workflow.") .withRequiredArg(); parser.acceptsAll( Arrays.asList("force-host", "fh"), "Optional: if specified, workflow runs scheduled to this specified host will be checked even if this is not the current host (a dangerous option).") .withRequiredArg(); parser.acceptsAll(Arrays.asList("check-failed", "cf"), "Optional: if specified, workflow runs that have previously failed will be re-checked."); parser.acceptsAll(Arrays.asList("threads-in-thread-pool", "tp"), "Optional: this will determine the number of threads to run with. Default: 1").withRequiredArg().ofType(Integer.class); } /** * {@inheritDoc} * * @return */ @Override public ReturnValue init() { RunLock.acquire(); LocalhostPair localhost = FileTools.getLocalhost(options); // returnValue can be null if we use forcehost if (localhost.returnValue != null && localhost.returnValue.getExitStatus() != ReturnValue.SUCCESS) { return (localhost.returnValue); } else { this.hostname = localhost.hostname; } // figure out the username if (this.config.get(SqwKeys.SW_REST_USER.getSettingKey()) == null || "".equals(this.config.get(SqwKeys.SW_REST_USER.getSettingKey()))) { Log.error("You must define " + SqwKeys.SW_REST_USER.getSettingKey() + " in your SeqWare settings file!"); return new ReturnValue(ExitStatus.FAILURE); } this.username = this.config.get(SqwKeys.SW_REST_USER.getSettingKey()); return new ReturnValue(); } /** * {@inheritDoc} * * @return */ @Override public ReturnValue do_test() { return new ReturnValue(); } /** * {@inheritDoc} * * @return */ @Override public ReturnValue do_run() { ReturnValue ret = new ReturnValue(ReturnValue.SUCCESS); // this checks workflows and writes their status back to the DB Set<WorkflowRun> runningWorkflows = new HashSet<>(); if (options.has(WORKFLOW_RUN_ACCESSION)) { List<Integer> swids = (List<Integer>) options.valuesOf(WORKFLOW_RUN_ACCESSION); for (Integer swid : swids) { WorkflowRun wr = this.metadata.getWorkflowRun(swid); runningWorkflows.add(wr); } } else { runningWorkflows.addAll(this.metadata.getWorkflowRunsByStatus(WorkflowRunStatus.running)); runningWorkflows.addAll(this.metadata.getWorkflowRunsByStatus(WorkflowRunStatus.pending)); runningWorkflows.addAll(this.metadata.getWorkflowRunsByStatus(WorkflowRunStatus.submitted_cancel)); runningWorkflows.addAll(this.metadata.getWorkflowRunsByStatus(WorkflowRunStatus.submitted_retry)); if (options.has("check-failed")) { runningWorkflows.addAll(this.metadata.getWorkflowRunsByStatus(WorkflowRunStatus.failed)); } } // setup thread pool ExecutorService pool; // Executors.newFixedThreadPool(4); if (options.has("threads-in-thread-pool")) { int threads = (Integer) options.valueOf("threads-in-thread-pool"); if (threads <= 0) { Log.fatal("Inappropriate number of threads selected"); ret = new ReturnValue(ReturnValue.FAILURE); return ret; } pool = Executors.newFixedThreadPool(threads); } else { pool = Executors.newSingleThreadExecutor(); } List<Future<?>> futures = new ArrayList<>(runningWorkflows.size()); // loop over running workflows and check their status for (WorkflowRun wr : runningWorkflows) { futures.add(pool.submit(new CheckerThread(wr))); } for (Future<?> future : futures) { try { future.get(); } catch (InterruptedException | ExecutionException ex) { Log.fatal(ex); } } pool.shutdown(); return ret; } /** * {@inheritDoc} * * @return */ @Override public ReturnValue clean_up() { RunLock.release(); return new ReturnValue(); } /** * {@inheritDoc} * * @return */ @Override public String get_description() { return "This plugin lets you monitor the status of running workflows and updates " + "the metadata object with their status. By default every running or unknown " + "workflow_run in the database will be checked if they are owned by the username in your .seqware/settings file " + "and the hostname is the same as 'hostname --long'. You can force the checking of workflows with a particular " + "host value but be careful with that."; } protected Metadata getMetadata() { return metadata; } private final class CheckerThread implements Runnable { private final WorkflowRun wr; protected CheckerThread(WorkflowRun wr) { this.wr = wr; } @Override public void run() { Log.info("ownerUserName: " + wr.getOwnerUserName()); Log.info("workflowAccession: " + wr.getWorkflowAccession()); Log.info("workflowRunID: " + wr.getWorkflowRunId()); // check that this workflow run matches the specified workflow if provided if (options.has("workflow-accession") && options.valueOf("workflow-accession") != null && !((String) options.valueOf("workflow-accession")).equals(wr.getWorkflowAccession().toString())) { return; } // ignore host matching when run accession is specified if (options.has(WORKFLOW_RUN_ACCESSION) == false) { // check the host is either overridden or this is the same host the // workflow was launched from if (options.has("force-host") && options.valueOf("force-host") != null && !((String) options.valueOf("force-host")).equals(wr.getHost())) { return; } else if (!options.has("force-host") && WorkflowStatusChecker.this.hostname != null && !WorkflowStatusChecker.this.hostname.equals(wr.getHost())) { return; } } // check the rest API username from SeqWare settings is the same username // in the DB if (WorkflowStatusChecker.this.username == null || wr.getOwnerUserName() == null || !WorkflowStatusChecker.this.username.equals(wr.getOwnerUserName())) { return; } if (Engines.isOozie(wr.getWorkflowEngine())) { checkOozie(); } else if (Engines.isWhiteStar(wr.getWorkflowEngine())) { checkWhiteStar(); } else { throw new RuntimeException("No other workflow engines currently supported"); } } private void checkWhiteStar() { String out = extractStdOut(wr, null); String err = extractStdErr(wr, null); synchronized (METADATA_SYNC) { wr.setStdErr(err); wr.setStdOut(out); WorkflowStatusChecker.this.metadata.updateWorkflowRun(wr); } } private void checkOozie() { try { OozieClient oc = new OozieClient((String) config.get(SqwKeys.OOZIE_URL.getSettingKey())); String jobId = wr.getStatusCmd(); if (jobId == null) { handlePreLaunch(); return; } WorkflowJob wfJob = oc.getJobInfo(jobId); if (wfJob == null) { throw new IllegalStateException("No Oozie job found for WorkflowRun: swid=" + wr.getSwAccession() + " oozie-id=" + jobId); } WorkflowRunStatus curSqwStatus = wr.getStatus(); WorkflowRunStatus nextSqwStatus; if (curSqwStatus == null) { nextSqwStatus = convertOozieToSeqware(wfJob.getStatus()); } else { switch (curSqwStatus) { case submitted_cancel: { switch (wfJob.getStatus()) { case PREP: case RUNNING: case SUSPENDED: // Note: here we treat SUSPENDED as running, so that it can be killed oc.kill(jobId); nextSqwStatus = WorkflowRunStatus.cancelled; break; default: // Let others propagate as normal nextSqwStatus = convertOozieToSeqware(wfJob.getStatus()); } break; } case submitted_retry: { switch (wfJob.getStatus()) { case SUSPENDED: oc.resume(jobId); nextSqwStatus = WorkflowRunStatus.pending; break; case FAILED: case KILLED: Properties conf = getCurrentConf(wfJob); // here we need specify the precise nodes to skip since OozieClient.RERUN_FAIL_NODES is bugged due to // OOZIE-1879 // conf.setProperty(OozieClient.RERUN_FAIL_NODES, "true"); WorkflowJob jobInfo = oc.getJobInfo(jobId); StringBuilder nodesToSkip = new StringBuilder(); for (WorkflowAction action : jobInfo.getActions()) { Log.debug("examining node: " + action.getName()); if (JobStatus.SUCCESSFUL.name().equals(action.getExternalStatus())) { if (nodesToSkip.length() != 0) { nodesToSkip.append(","); } nodesToSkip.append(action.getName()); } } Log.info("skipping nodes: " + nodesToSkip.toString()); conf.setProperty(OozieClient.RERUN_SKIP_NODES, nodesToSkip.toString()); oc.reRun(jobId, conf); nextSqwStatus = WorkflowRunStatus.pending; break; default: // Let others propagate as normal nextSqwStatus = convertOozieToSeqware(wfJob.getStatus()); } break; } default: nextSqwStatus = convertOozieToSeqware(wfJob.getStatus()); } } String err; String out; if (wr.getWorkflowEngine().equals("oozie-sge")) { Set<String> extIds = sgeIds(wfJob); out = extractStdOut(wr, extIds); err = extractStdErr(wr, extIds); } else { StringBuilder sb = new StringBuilder(); for (WorkflowAction action : wfJob.getActions()) { if (action.getErrorMessage() != null) { sb.append(MessageFormat.format(" Name: {0} Type: {1} ErrorMessage: {2}\n", action.getName(), action.getType(), action.getErrorMessage())); } } out = ""; err = sb.toString(); } synchronized (METADATA_SYNC) { wr.setStatus(nextSqwStatus); wr.setStdErr(err); wr.setStdOut(out); WorkflowStatusChecker.this.metadata.updateWorkflowRun(wr); } } catch (RuntimeException e) { throw e; } catch (Exception e) { throw new RuntimeException(e); } } private void handlePreLaunch() { switch (wr.getStatus()) { case submitted_cancel: // run cancelled before launching wr.setStatus(WorkflowRunStatus.cancelled); synchronized (METADATA_SYNC) { WorkflowStatusChecker.this.metadata.updateWorkflowRun(wr); } break; case submitted_retry: // retrying a pre-launch cancellation wr.setStatus(WorkflowRunStatus.submitted); synchronized (METADATA_SYNC) { WorkflowStatusChecker.this.metadata.updateWorkflowRun(wr); } break; default: throw new IllegalStateException("No Oozie job ID found for WorkflowRun: swid=" + wr.getSwAccession() + " status=" + wr.getStatus().name()); } } @SuppressWarnings("deprecation") private Properties getCurrentConf(WorkflowJob wfJob) { /* * Why this method is needed: * * To rerun an oozie job, one must pass in a Properties instance. * * The current conf of a WorkflowJob is only exposed via getConf() which does not return a Properties instance, but rather a * String of XML. * * The XML is not of a Properties, but rather of a hadoop Configuration! * * A hadoop Configuration instance cannot be loaded from a String, but only from resources or an input stream. * * Further, a hadoop Configuration instance does not expose a public method for obtaining a Properties representation. * * It does expose an iterator of Map.Entry objects (which is internally obtained from a Properties instance!). * * It'd be swell if these guys could just pick one representation, or at least an easy way to convert between them. */ Configuration conf = new Configuration(false); conf.addResource(new StringBufferInputStream(wfJob.getConf())); Properties props = new Properties(); for (Map.Entry<String, String> e : conf) { props.setProperty(e.getKey(), e.getValue()); } return props; } private WorkflowRunStatus convertOozieToSeqware(WorkflowJob.Status oozieStatus) { WorkflowRunStatus sqwStatus; /* * There's no analog to SUSPENDED on the seqware side, treating as failed so it can be picked up for retry */ switch (oozieStatus) { case PREP: case RUNNING: sqwStatus = WorkflowRunStatus.running; break; case SUSPENDED: case FAILED: sqwStatus = WorkflowRunStatus.failed; break; case KILLED: /* * NOTE: At the time of this writing, Oozie workflows that fail due to an error have an oozie status of KILLED. This would * result in failed workflows appearing in seqware as 'cancelled'. * * To compensate for this idiosyncrasy, we will treat KILLED workflow runs as FAILED. Workflow runs that are cancelled via * seqware will correctly have their status properly set to 'cancelled', since we are aware of the intent. * * The drawback is that workflow runs killed via other means, e.g., HUE, will be propagated back to seqware as 'failed'. I * feel this is the best of the bad options. */ // sqwStatus = WorkflowRunStatus.cancelled; sqwStatus = WorkflowRunStatus.failed; break; case SUCCEEDED: sqwStatus = WorkflowRunStatus.completed; break; default: throw new RuntimeException("Unexpected oozie status value: " + oozieStatus); } return sqwStatus; } } private static final Pattern SGE_OUT_FILE = Pattern.compile(".+\\.o(\\d+)"); private static final Pattern SGE_ERR_FILE = Pattern.compile(".+\\.e(\\d+)"); private static SortedMap<Integer, File> sgeFiles(Pattern p, File dir, final Set<String> extIds) { SortedMap<Integer, File> idFiles = new TreeMap<>(); for (File f : dir.listFiles()) { Matcher m = p.matcher(f.getName()); if (m.find()) { String id = m.group(1); if (extIds != null && extIds.contains(id)) { idFiles.put(Integer.parseInt(id), f); } // don't filter anything if no filter specified if (extIds == null) { idFiles.put(Integer.parseInt(id), f); } } } return idFiles; } private static final Pattern SGE_FILE = Pattern.compile("(.+)\\.[eo]\\d+"); private static String sgeConcat(SortedMap<Integer, File> idFiles, String fullExtension) { StringBuilder sb = new StringBuilder(); for (Map.Entry<Integer, File> e : idFiles.entrySet()) { File f = e.getValue(); Matcher m = SGE_FILE.matcher(f.getName()); m.find(); String jobName = m.group(1); sb.append("-----------------------------------------------------------------------"); sb.append("\nJob Name: "); sb.append(jobName); sb.append("\nJob ID: "); sb.append(e.getKey()); sb.append("\nFile: "); sb.append(f.getAbsolutePath()); sb.append("\nUpdated: "); sb.append(new Date(f.lastModified())); sb.append("\nContents Excerpt:\n"); try { sb.append(stripInvalidXmlCharacters(FileUtils.readFileToString(f))); } catch (IOException ex) { sb.append(" *** ERROR READING FILE: "); sb.append(ex.getMessage()); sb.append(" ***"); } if (sb.charAt(sb.length() - 1) != '\n') { sb.append("\n"); } // check if file exists before pointing people at it Path fullFile = Paths.get(f.getAbsoluteFile().getParent() + "/" + jobName + "." + fullExtension); if (Files.exists(fullFile)) { sb.append("\nFull output: ").append(f.getAbsoluteFile().getParent()).append("/").append(jobName).append(".") .append(fullExtension).append('\n'); } sb.append("-----------------------------------------------------------------------\n\n"); } return sb.toString(); } private static Set<String> sgeIds(WorkflowJob wf) { List<WorkflowAction> actions = wf.getActions(); final Set<String> extIds = new HashSet<>(); for (WorkflowAction a : actions) { if (a == null) { Log.fatal("Null action in Oozie provided list of actions in " + wf.toString()); continue; } String extId = a.getExternalId(); extIds.add(extId); } return extIds; } /** * Stolen from https://stackoverflow.com/questions/93655/stripping-invalid-xml-characters-in-java/9635310#9635310 * * @param input * @return */ public static String stripInvalidXmlCharacters(String input) { StringBuilder sb = new StringBuilder(); for (int i = 0; i < input.length(); i++) { char c = input.charAt(i); if (XMLChar.isValid(c)) { sb.append(c); } } return sb.toString(); } /** * Extract stderr from a workflow run * * @param wr * @param set * filter SGE results using a particular id * @return */ public static String extractStdErr(WorkflowRun wr, Set<String> set) { String err; File dir = OozieJob.scriptsDir(wr.getCurrentWorkingDir()); if (dir.exists()) { err = sgeConcat(sgeFiles(SGE_ERR_FILE, dir, null), "stderr"); } else { // working dir has been deleted, do not wipe-out the stored output err = wr.getStdErr(); } return err; } /** * Extract stdout from a workflow run * * @param wr * @param set * filter SGE results using a particular id * @return */ public static String extractStdOut(WorkflowRun wr, Set<String> set) { String out; File dir = OozieJob.scriptsDir(wr.getCurrentWorkingDir()); if (dir.exists()) { out = sgeConcat(sgeFiles(SGE_OUT_FILE, dir, null), "stdout"); } else { // working dir has been deleted, do not wipe-out the stored output out = wr.getStdOut(); } return out; } }