package net.sourceforge.seqware.pipeline.workflowV2.engine.oozie.object;

import io.seqware.pipeline.SqwKeys;
import java.io.File;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import net.sourceforge.seqware.common.util.Log;
import net.sourceforge.seqware.common.util.configtools.ConfigTools;
import net.sourceforge.seqware.common.util.maptools.ReservedIniKeys;
import net.sourceforge.seqware.pipeline.workflowV2.AbstractWorkflowDataModel;
import net.sourceforge.seqware.pipeline.workflowV2.model.AbstractJob;
import net.sourceforge.seqware.pipeline.workflowV2.model.BashJob;
import net.sourceforge.seqware.pipeline.workflowV2.model.Job;
import net.sourceforge.seqware.pipeline.workflowV2.model.JobBatch;
import net.sourceforge.seqware.pipeline.workflowV2.model.SqwFile;
import org.apache.hadoop.fs.Path;
import org.jdom.Element;

/**
 * This class is responsible for the conversion of our AbstractWorkflowDataModel to an Oozie XML workflow.
 */
public class WorkflowApp {

    public static final String URIOOZIEWORKFLOW = "uri:oozie:workflow:0.4";
    public static final org.jdom.Namespace NAMESPACE = org.jdom.Namespace.getNamespace(URIOOZIEWORKFLOW);
    public static final int BUCKET_SIZE = Integer.parseInt(ConfigTools.getSettings().containsKey(
            SqwKeys.OOZIE_BATCH_SIZE.getSettingKey()) ? ConfigTools.getSettings().get(SqwKeys.OOZIE_BATCH_SIZE.getSettingKey()) : "100");
    public static final int THRESHOLD = Integer.parseInt(ConfigTools.getSettings().containsKey(
            SqwKeys.OOZIE_BATCH_THRESHOLD.getSettingKey()) ? ConfigTools.getSettings().get(SqwKeys.OOZIE_BATCH_THRESHOLD.getSettingKey())
            : "5");

    // note: job names must be prefixed with a letter of the alphabet
    public static final String JOB_PREFIX = "s";

    private final AbstractWorkflowDataModel wfdm;
    /** A list of all jobs, in order. */
    private final List<OozieJob> jobs;
    /** Name of the last join job in a workflow. */
    private String lastJoin;
    /**
     * Map of files that are attached directly to the workflow instead of to a specific job;
     * used for constructing the graph.
     */
    private final Map<SqwFile, OozieJob> fileJobMap;
    private final String uniqueWorkingDir;
    private final Path hdfsWorkDir;
    private final boolean useSge;
    private final File seqwareJar;
    private final String threadsSgeParamFormat;
    private final String maxMemorySgeParamFormat;
    private final StringTruncator stringTruncator = new StringTruncator();

    public WorkflowApp(AbstractWorkflowDataModel wfdm, String nfsWorkDir, Path hdfsWorkDir, boolean useSge, File seqwareJar,
            String threadsSgeParamFormat, String maxMemorySgeParamFormat) {
        this.wfdm = wfdm;
        this.uniqueWorkingDir = nfsWorkDir;
        this.hdfsWorkDir = hdfsWorkDir;
        this.jobs = new ArrayList<>();
        this.fileJobMap = new HashMap<>();
        this.useSge = useSge;
        this.seqwareJar = seqwareJar;
        this.threadsSgeParamFormat = threadsSgeParamFormat;
        this.maxMemorySgeParamFormat = maxMemorySgeParamFormat;
        this.parseDataModel(wfdm);
    }

    public List<List<OozieJob>> getOrderedJobs() {
        if (!this.jobs.isEmpty()) {
            OozieJob job0 = this.jobs.get(0);
            return this.reOrganizeGraph(job0);
        }
        return new ArrayList<>();
    }

    // TODO: Emit an end node that cleans up the generated script files.
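    /*
     * For orientation, the XML produced by serializeXML() below has roughly the following
     * shape. This is an illustrative sketch with hypothetical action names, not output
     * captured from a real run:
     *
     *   <workflow-app name="MyWorkflow" xmlns="uri:oozie:workflow:0.4">
     *     <start to="s123_start_0"/>
     *     <action name="s123_start_0">...</action>   <!-- one element per OozieJob -->
     *     <fork name="fork_0">...</fork>             <!-- only for levels with >1 job -->
     *     <join name="join_0" to="..."/>
     *     <action name="done">                       <!-- deletes the HDFS working dir -->
     *       <fs><delete path="hdfs://..."/></fs>
     *       <ok to="end"/>
     *       <error to="fail"/>
     *     </action>
     *     <kill name="fail">
     *       <message>Java failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
     *     </kill>
     *     <end name="end"/>
     *   </workflow-app>
     */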
    public Element serializeXML() {
        Element wf = new Element("workflow-app", NAMESPACE);
        wf.setAttribute("name", wfdm.getName());
        if (!this.jobs.isEmpty()) {
            OozieJob job0 = this.jobs.get(0);
            Element start = new Element("start", NAMESPACE);
            start.setAttribute("to", job0.getShortName());
            wf.addContent(start);
            List<List<OozieJob>> graph = this.reOrganizeGraph(job0);
            this.generateWorkflowXml2(wf, graph);
        }
        if (this.lastJoin != null && !this.lastJoin.isEmpty()) {
            Element lastJoinLocal = new Element("join", NAMESPACE);
            lastJoinLocal.setAttribute("name", this.lastJoin);
            lastJoinLocal.setAttribute("to", "done");
            wf.addContent(lastJoinLocal);
        }
        Element done = new Element("action", NAMESPACE).setAttribute("name", "done");
        Element fs = new Element("fs", NAMESPACE);
        Element delete = new Element("delete", NAMESPACE).setAttribute("path", hdfsWorkDir.toString());
        Element ok = new Element("ok", NAMESPACE).setAttribute("to", "end");
        Element error = new Element("error", NAMESPACE).setAttribute("to", "fail");
        wf.addContent(done);
        done.addContent(fs);
        fs.addContent(delete);
        done.addContent(ok);
        done.addContent(error);
        Element kill = new Element("kill", NAMESPACE);
        kill.setAttribute("name", "fail");
        Element message = new Element("message", NAMESPACE);
        message.setText("Java failed, error message[${wf:errorMessage(wf:lastErrorNode())}]");
        kill.addContent(message);
        wf.addContent(kill);
        Element end = new Element("end", NAMESPACE);
        end.setAttribute("name", "end");
        wf.addContent(end);
        return wf;
    }

    private void generateWorkflowXml2(Element rootElement, List<List<OozieJob>> graph) {
        OozieJob root = graph.get(0).get(0);
        Element currentE = root.serializeXML();
        rootElement.addContent(currentE);
        for (int i = 1; i < graph.size(); i++) {
            currentE = this.generateNextLevelXml(rootElement, graph.get(i), currentE, i - 1);
        }
        // point the last element at the "done" cleanup action
        if (currentE.getName().equals("action")) {
            currentE.getChild("ok", NAMESPACE).setAttribute("to", "done");
        } else {
            currentE.setAttribute("to", "done");
        }
    }

    private Element generateNextLevelXml(Element rootElement, List<OozieJob> joblist, Element currentElement, int count) {
        Element ret;
        // currentElement may be an action or a join; the forward pointer is the "to"
        // attribute on an action's <ok> child, or on the join element itself
        Element setNext = currentElement;
        if (currentElement.getName().equals("action")) {
            setNext = currentElement.getChild("ok", NAMESPACE);
        }
        if (joblist.size() > 1) {
            // more than one job on this level: emit a fork and a join
            String forkName = "fork_" + count;
            setNext.setAttribute("to", forkName);
            Element forkE = new Element("fork", NAMESPACE);
            forkE.setAttribute("name", forkName);
            for (OozieJob job : joblist) {
                Element path = new Element("path", NAMESPACE);
                path.setAttribute("start", job.getShortName());
                forkE.addContent(path);
            }
            rootElement.addContent(forkE);
            String joinName = "join_" + count;
            // add an action for each job
            for (OozieJob job : joblist) {
                job.setOkTo(joinName);
                rootElement.addContent(job.serializeXML());
            }
            // add the join element
            Element joinE = new Element("join", NAMESPACE);
            joinE.setAttribute("name", joinName);
            rootElement.addContent(joinE);
            ret = joinE;
        } else {
            OozieJob job = joblist.get(0);
            setNext.setAttribute("to", job.getShortName());
            Element nextE = job.serializeXML();
            rootElement.addContent(nextE);
            ret = nextE;
        }
        return ret;
    }
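    /*
     * Illustrative sketch of what generateNextLevelXml emits for a level holding two jobs
     * (the action names are hypothetical):
     *
     *   <fork name="fork_2">
     *     <path start="s123_align_3"/>
     *     <path start="s123_align_4"/>
     *   </fork>
     *   <action name="s123_align_3">... <ok to="join_2"/> ...</action>
     *   <action name="s123_align_4">... <ok to="join_2"/> ...</action>
     *   <join name="join_2" to="..."/>   <!-- "to" is filled in when the next level is emitted -->
     */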
    private void parseDataModel(AbstractWorkflowDataModel wfdm) {
        boolean metadatawriteback = wfdm.isMetadataWriteBack();
        final String workflowRunAccession = wfdm.getWorkflow_run_accession();
        Set<OozieJob> parents = new LinkedHashSet<>();

        // the first job creates the working directories (mkdir data job)
        AbstractJob abstractRootJob = new BashJob("createdirs");
        abstractRootJob.getCommand().addArgument("mkdir -p provisionfiles; ");
        // check for user-defined directories
        if (!wfdm.getDirectories().isEmpty()) {
            for (String dir : wfdm.getDirectories()) {
                abstractRootJob.getCommand().addArgument("mkdir -p " + dir + "; ");
            }
        }
        // 2 GB should be more than enough for start_0 based on metrics in PDE dev;
        // the 3000 MB default leaves a margin of safety
        String startMem = ConfigTools.getSettings().get(SqwKeys.SW_CONTROL_NODE_MEMORY.getSettingKey());
        abstractRootJob.setMaxMemory(startMem == null ? "3000" : startMem);

        OozieJob oozieRootJob = new OozieBashJob(abstractRootJob, JOB_PREFIX + workflowRunAccession + "_start_" + this.jobs.size(),
                this.uniqueWorkingDir, this.useSge, this.seqwareJar, this.threadsSgeParamFormat, this.maxMemorySgeParamFormat,
                this.stringTruncator);
        oozieRootJob.setMetadataWriteback(metadatawriteback);

        // if there are parent accessions, assign them to the first job
        Collection<String> parentAccession = wfdm.getParentAccessions();
        if (parentAccession != null && !parentAccession.isEmpty()) {
            oozieRootJob.setParentAccessions(parentAccession);
        }
        if (workflowRunAccession != null && !workflowRunAccession.isEmpty()) {
            oozieRootJob.setWorkflowRunAccession(workflowRunAccession);
            oozieRootJob.setWorkflowRunAncesstor(true);
        }
        this.jobs.add(oozieRootJob);
        parents.add(oozieRootJob);

        // handles all provision file events not attached to jobs;
        // mutates parents so that it stores all provision-in jobs
        handleUnattachedProvisionFileEvents(wfdm, metadatawriteback, workflowRunAccession, parents, abstractRootJob);

        // pre-create all Oozie jobs so that we can reference them when attaching parents
        for (AbstractJob job : wfdm.getWorkflow().getJobs()) {
            OozieJob oozieActualJob = this.createOozieJobObject(job, wfdm);
            oozieActualJob.setMetadataWriteback(metadatawriteback);
            if (workflowRunAccession != null && !workflowRunAccession.isEmpty()) {
                oozieActualJob.setWorkflowRunAccession(workflowRunAccession);
            }
            // SEQWARE-1804 transfer setParentAccessions information ala Pegasus version in
            // net.sourceforge.seqware.pipeline.workflowV2.engine.pegasus.object.Adag
            if (!job.getParentAccessions().isEmpty()) {
                oozieActualJob.setParentAccessions(job.getParentAccessions());
            }
            this.jobs.add(oozieActualJob);
        }

        // need to remember the provision-outs and reset each job's children to
        // the provision-out's children
        for (AbstractJob job : wfdm.getWorkflow().getJobs()) {
            OozieJob oozieActualJob = this.getOozieJobObject(job);
            Log.debug("Manipulating parents for " + oozieActualJob.getLongName());
            for (Job parent : job.getParents()) {
                oozieActualJob.addParent(this.getOozieJobObject((AbstractJob) parent));
            }
            // handle the provision file events that are associated with a specific job
            handleAttachedProvisionFileEventsForJob(job, oozieRootJob, metadatawriteback, workflowRunAccession, oozieActualJob,
                    abstractRootJob, wfdm);
            // if this job has no parents, assume that the provision-in events we saw are the parents
            if (oozieActualJob.getParents().isEmpty()) {
                for (OozieJob parent : parents) {
                    oozieActualJob.addParent(parent);
                }
            }
        }

        // all leaves (nodes with no children that are not provision-outs) become parents of all provision-outs
        this.linkLeafsAsProvisionOutParents();
        // join all leaves to an artificial join if necessary
        this.setEndJob();
        // go through and duplicate parent accession files from parents to their children
        this.setAccessionFileRelations(oozieRootJob);

        // go through and set stdout/stderr buffer sizes if they are not set by the workflow developer
        if (wfdm.getConfigs().containsKey(ReservedIniKeys.SEQWARE_LINES_NUMBER.getKey())) {
            for (AbstractJob job : wfdm.getWorkflow().getJobs()) {
                if (job.getCommand().getOutputLineCapacity() == null) {
                    job.getCommand().setOutputLineCapacity(
                            Integer.valueOf(wfdm.getConfigs().get(ReservedIniKeys.SEQWARE_LINES_NUMBER.getKey())));
                }
            }
        }
    }
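    /*
     * A minimal sketch of the graph parseDataModel builds for a workflow with one user job,
     * one unattached input file, and one attached output file (job names are hypothetical):
     *
     *   s<acc>_start_0 (createdirs)
     *        |
     *   s<acc>_pfi_..._1 (provision file in)
     *        |
     *   s<acc>_myAlgo_2 (the user's job)
     *        |
     *   s<acc>_pfo_3 (provision file out)
     *        |
     *   [artificial join, only when more than one leaf remains]
     *        |
     *   done (deletes the HDFS working dir) -> end
     */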
    /**
     * If the object model has more than one leaf job, this method creates an artificial join and transitions the leaves to it.
     */
    private void setEndJob() {
        if (needLastJoin()) {
            // use a unique name for the join action to avoid name conflicts
            this.lastJoin = "join_" + Long.toString(System.nanoTime());
            for (OozieJob job : this.jobs) {
                if (job.getChildren().isEmpty()) {
                    job.setOkTo(this.lastJoin);
                }
            }
        }
    }

    /**
     * Goes through all jobs in this.jobs and returns true iff there is more than one leaf (node with no children).
     *
     * @return
     */
    private boolean needLastJoin() {
        int leafCount = 0;
        for (OozieJob job : this.jobs) {
            if (job.getChildren().isEmpty()) {
                leafCount++;
            }
        }
        return leafCount > 1;
    }

    private OozieJob createOozieJobObject(AbstractJob job, AbstractWorkflowDataModel wfdm) {
        final String workflowRunAccession = wfdm.getWorkflow_run_accession();
        if (job instanceof BashJob) {
            return new OozieBashJob(job, JOB_PREFIX + workflowRunAccession + "_" + job.getAlgo() + "_" + this.jobs.size(),
                    this.uniqueWorkingDir, this.useSge, this.seqwareJar, this.threadsSgeParamFormat, this.maxMemorySgeParamFormat,
                    this.stringTruncator);
        } else if (job instanceof JobBatch) {
            BatchedOozieBashJob batchJob = new BatchedOozieBashJob(job, JOB_PREFIX + workflowRunAccession + "_" + job.getAlgo() + "_"
                    + this.jobs.size(), this.uniqueWorkingDir, this.useSge, this.seqwareJar, this.threadsSgeParamFormat,
                    this.maxMemorySgeParamFormat, this.stringTruncator);
            for (Job bJob : ((JobBatch) job).getJobList()) {
                if (bJob instanceof BashJob) {
                    BashJob nobJob = (BashJob) bJob;
                    OozieBashJob obJob = new OozieBashJob(nobJob, JOB_PREFIX + workflowRunAccession + "_" + nobJob.getAlgo(),
                            this.uniqueWorkingDir, this.useSge, this.seqwareJar, this.threadsSgeParamFormat,
                            this.maxMemorySgeParamFormat, this.stringTruncator);
                    batchJob.attachJob(obJob);
                } else {
                    throw new UnsupportedOperationException("Only BashJob members are supported inside a JobBatch");
                }
            }
            return batchJob;
        } else {
            throw new UnsupportedOperationException("No oozie support for job type " + job.getClass());
        }
    }

    private OozieJob getOozieJobObject(AbstractJob job) {
        for (OozieJob pjob : this.jobs) {
            if (job.equals(pjob.getJobObject())) {
                return pjob;
            }
        }
        return null;
    }
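    /*
     * A small worked example of the reorganization below, using hypothetical jobs.
     * Given the DAG
     *
     *   A -> B,  A -> D,  B -> D
     *
     * a naive breadth-first walk would place D on level 1 (as a child of A) and again on
     * level 2 (as a child of B). Because getNextLevel removes a job from any earlier level
     * once it reappears deeper, D survives only on the deepest level where it is reachable:
     *
     *   level 0: [A]
     *   level 1: [B]
     *   level 2: [D]
     *
     * which serializes to the chain A -> B -> D and satisfies both of D's dependencies.
     */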
    /**
     * Reorganizes the graph into a form that is compatible with Oozie.
     *
     * This ensures that job names are unique on each level of the DAG.
     *
     * @param root
     * @return
     */
    private List<List<OozieJob>> reOrganizeGraph(OozieJob root) {
        List<List<OozieJob>> newGraph = new ArrayList<>();
        // to avoid duplicated actions
        Set<String> jobName = new HashSet<>();
        // add the root
        List<OozieJob> rootList = new ArrayList<>();
        rootList.add(root);
        newGraph.add(rootList);
        jobName.add(root.getLongName());
        this.getNextLevel(newGraph, jobName);
        return newGraph;
    }

    private void getNextLevel(List<List<OozieJob>> graph, Set<String> existingJob) {
        List<OozieJob> lastLevel = graph.get(graph.size() - 1);
        List<OozieJob> nextLevel = new ArrayList<>();
        Set<OozieJob> removed = new HashSet<>();
        for (OozieJob job : lastLevel) {
            for (OozieJob child : job.getChildren()) {
                if (!nextLevel.contains(child)) {
                    nextLevel.add(child);
                }
                // if the child was already placed on an earlier level, remove it from there
                if (existingJob.contains(child.getLongName())) {
                    removed.add(child);
                }
                existingJob.add(child.getLongName());
            }
        }
        if (!removed.isEmpty()) {
            for (OozieJob rm : removed) {
                for (List<OozieJob> level : graph) {
                    level.remove(rm);
                }
            }
        }
        if (!nextLevel.isEmpty()) {
            graph.add(nextLevel);
            getNextLevel(graph, existingJob);
        }
    }

    /**
     * Given a graph, duplicate all parent accession files from parents to their children.
     *
     * @param parent
     */
    private void setAccessionFileRelations(OozieJob parent) {
        this.setAccessionFileRelations(parent, 0);
    }

    /**
     * Helper method that duplicates parent accession files from parents to their children.
     *
     * @param parent
     * @param level
     */
    private void setAccessionFileRelations(OozieJob parent, int level) {
        Log.debug(level + ": SETTING ACCESSIONS FOR CHILDREN FOR PARENT JOB " + parent.getLongName());
        for (OozieJob pjob : parent.getChildren()) {
            Log.debug(level + ": RECURSIVE SETTING ACCESSIONS FOR CHILD JOB " + pjob.getLongName());
            boolean added = pjob.addParentAccessionFile(parent.getAccessionFile().toArray(
                    new String[parent.getAccessionFile().size()]));
            Log.debug(level + ": Added success: " + added);
            if (!added) {
                // if no parent accession file was added, then recursive calls beyond this level
                // of recursion should be unnecessary and can be skipped;
                // this takes a substantial amount of time beyond five large forks in the workflow
                continue;
            }
            // FIXME: there is some (potentially very serious) bug here where loops
            // exist in the processing output provision parent/child relationships!
            // if (!pjob.getChildren().contains(parent)) { setAccessionFileRelations(pjob); }
            // don't bother calling this when it has already been called
            setAccessionFileRelations(pjob, level + 1);
        }
    }
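    /*
     * A hypothetical trace of the propagation above: if the root job carries accession
     * files [a1, a2], each child receives a copy of [a1, a2] and passes it on in turn;
     * whenever addParentAccessionFile reports that nothing new was added, the recursion
     * is pruned at that child, which matters once a workflow contains several large forks.
     */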
    /**
     * All leaves (nodes with no children that are not provision-outs) become parents of all provision-outs.
     */
    private void linkLeafsAsProvisionOutParents() {
        // gather all the leaf jobs
        List<OozieJob> leaves = new ArrayList<>();
        for (OozieJob job : this.jobs) {
            // Note: the leaves accumulated here are to be parents of output provisions,
            // thus the leaves themselves should not be file provisions
            if (!(job instanceof OozieProvisionFileJob) && !(job instanceof BatchedOozieProvisionFileJob)
                    && job.getChildren().isEmpty()) {
                leaves.add(job);
            }
        }
        // make every leaf a parent of every provision-out
        for (Map.Entry<SqwFile, OozieJob> entry : fileJobMap.entrySet()) {
            if (entry.getKey().isOutput()) {
                for (OozieJob leaf : leaves) {
                    entry.getValue().addParent(leaf);
                }
            }
        }
    }

    /**
     * Creates a bucket generator if we require buckets, otherwise returns null.
     *
     * @return
     */
    private BucketGenerator isRequireBuckets(final Collection<SqwFile> files, final boolean input, final String uniqueName,
            String workflowRunAccession) {
        // only use buckets for SGE for now
        if (!this.useSge) {
            return null;
        }
        int numFiles = countInputFiles(files);
        if (!input) {
            numFiles = files.size() - numFiles;
        }
        Log.debug(numFiles + " counted as " + (input ? "input" : "output") + " files");
        BucketGenerator bucketGenerator = null;
        if (numFiles > THRESHOLD) {
            Log.debug(numFiles + " above threshold of " + THRESHOLD + ", using batching");
            bucketGenerator = new BucketGenerator(input, uniqueName, workflowRunAccession);
        }
        return bucketGenerator;
    }
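    /*
     * The bucketing arithmetic, as a hypothetical example: with the default THRESHOLD of 5
     * and BUCKET_SIZE of 100, a file set with 250 input provisions exceeds the threshold,
     * so those files are folded into three BatchedOozieProvisionFileJob buckets holding
     * 100, 100, and 50 provisions respectively; 5 or fewer input provisions run as
     * individual jobs. Batching is only ever considered when the engine runs on SGE.
     */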
"i" : "o") + "_" + entry.getKey().replaceAll("\\.", "_")); abstractProvisionXJob.getFiles().add(entry.getValue()); OozieProvisionFileJob oozieProvisionXJob = new OozieProvisionFileJob(abstractProvisionXJob, entry.getValue(), JOB_PREFIX + workflowRunAccession + "_" + abstractProvisionXJob.getAlgo() + "_" + this.jobs.size(), this.uniqueWorkingDir, this.useSge, this.seqwareJar, this.threadsSgeParamFormat, this.maxMemorySgeParamFormat, this.stringTruncator); oozieProvisionXJob.setMetadataWriteback(metadatawriteback); if (workflowRunAccession != null && !workflowRunAccession.isEmpty()) { oozieProvisionXJob.setWorkflowRunAccession(workflowRunAccession); } // SEQWARE-1804 transfer setParentAccessions information ala Pegasus version in // net.sourceforge.seqware.pipeline.workflowV2.engine.pegasus.object.Adag if (!entry.getValue().getParentAccessions().isEmpty()) { oozieProvisionXJob.setParentAccessions(entry.getValue().getParentAccessions()); } if (!useInputBatches && entry.getValue().isInput() || !useOutputBatches && entry.getValue().isOutput()) { this.jobs.add(oozieProvisionXJob); this.fileJobMap.put(entry.getValue(), oozieProvisionXJob); } else if (useInputBatches || useOutputBatches) { if (entry.getValue().isInput()) { assert (inputBucketGenerator != null); BatchedOozieProvisionFileJob currentInBucket = inputBucketGenerator.attachAndIterateBuckets(oozieProvisionXJob); if (!this.jobs.contains(currentInBucket)) { this.jobs.add(currentInBucket); } this.fileJobMap.put(entry.getValue(), currentInBucket); } else { assert (outputBucketGenerator != null); BatchedOozieProvisionFileJob currentOutBucket = outputBucketGenerator.attachAndIterateBuckets(oozieProvisionXJob); if (!this.jobs.contains(currentOutBucket)) { this.jobs.add(currentOutBucket); } this.fileJobMap.put(entry.getValue(), currentOutBucket); } } else { throw new RuntimeException("Invalid state for unattached bucket generation"); } // handle in if (entry.getValue().isInput()) { OozieJob target; if (useInputBatches) { assert (inputBucketGenerator != null); target = inputBucketGenerator.getCurrentBucket(); } else { target = oozieProvisionXJob; } newParents.add(target); for (OozieJob parent : parents) { target.addParent(parent); } // add mkdir to the first job, then set the file path String outputDir = this.uniqueWorkingDir + "/provisionfiles/" + entry.getValue().getUniqueDir(); abstractRootJob.getCommand().addArgument("mkdir -p " + outputDir + "; "); oozieProvisionXJob.setOutputDir(outputDir); } else { oozieProvisionXJob.setMetadataOutputPrefix(wfdm.getMetadata_output_file_prefix()); oozieProvisionXJob.setOutputDir(wfdm.getMetadata_output_dir()); // set the filepath } } // reset parents parents.clear(); parents.addAll(newParents); } } /** * Handles the creation of provision file events for a specific job. * * @param job * @param oozieRootJob * @param metadatawriteback * @param workflowRunAccession * @param oozieActualJob * @param abstractRootJob * @param wfdm */ private void handleAttachedProvisionFileEventsForJob(final AbstractJob job, final OozieJob oozieRootJob, final boolean metadatawriteback, final String workflowRunAccession, OozieJob oozieActualJob, AbstractJob abstractRootJob, final AbstractWorkflowDataModel wfdm) { // has provisionfiles dependency? 
    /**
     * Handles the creation of provision file events for a specific job.
     *
     * @param job
     * @param oozieRootJob
     * @param metadatawriteback
     * @param workflowRunAccession
     * @param oozieActualJob
     * @param abstractRootJob
     * @param wfdm
     */
    private void handleAttachedProvisionFileEventsForJob(final AbstractJob job, final OozieJob oozieRootJob,
            final boolean metadatawriteback, final String workflowRunAccession, OozieJob oozieActualJob,
            AbstractJob abstractRootJob, final AbstractWorkflowDataModel wfdm) {
        // does the job have provision-file dependencies?
        // this is based on the assumption that the provisionFiles job always runs
        // either before or after the actual job
        if (!job.getFiles().isEmpty()) {
            BucketGenerator inputBucketGenerator = isRequireBuckets(job.getFiles(), true, job.getAlgo() + "_" + jobs.size(),
                    workflowRunAccession);
            boolean useInputBatches = inputBucketGenerator != null;
            BucketGenerator outputBucketGenerator = isRequireBuckets(job.getFiles(), false, job.getAlgo() + "_" + jobs.size(),
                    workflowRunAccession);
            boolean useOutputBatches = outputBucketGenerator != null;
            for (SqwFile file : job.getFiles()) {
                // create a provision file job
                if (file.isInput()) {
                    AbstractJob abstractProvisionInJob = new BashJob("pfi");
                    abstractProvisionInJob.getFiles().add(file);
                    OozieProvisionFileJob oozieProvisionInJob = new OozieProvisionFileJob(abstractProvisionInJob, file, JOB_PREFIX
                            + workflowRunAccession + "_" + abstractProvisionInJob.getAlgo() + "_" + jobs.size(),
                            this.uniqueWorkingDir, this.useSge, this.seqwareJar, this.threadsSgeParamFormat,
                            this.maxMemorySgeParamFormat, this.stringTruncator);
                    oozieProvisionInJob.setMetadataWriteback(metadatawriteback);
                    if (workflowRunAccession != null && !workflowRunAccession.isEmpty()) {
                        oozieProvisionInJob.setWorkflowRunAccession(workflowRunAccession);
                    }
                    // SEQWARE-1804 transfer setParentAccessions information ala Pegasus version in
                    // net.sourceforge.seqware.pipeline.workflowV2.engine.pegasus.object.Adag
                    if (!file.getParentAccessions().isEmpty()) {
                        oozieProvisionInJob.setParentAccessions(file.getParentAccessions());
                    }
                    oozieProvisionInJob.setOutputDir("provisionfiles/" + file.getUniqueDir());
                    // add mkdir to the first job, then set the file path
                    abstractRootJob.getCommand().addArgument("mkdir -p provisionfiles/" + file.getUniqueDir() + "; ");
                    if (!useInputBatches) {
                        // hook up the graph
                        oozieProvisionInJob.addParent(oozieRootJob);
                        this.jobs.add(oozieProvisionInJob);
                        // perform the work after the provision-in finishes
                        oozieActualJob.addParent(oozieProvisionInJob);
                    } else {
                        assert inputBucketGenerator != null;
                        BatchedOozieProvisionFileJob currentInBucket = inputBucketGenerator.attachAndIterateBuckets(oozieProvisionInJob);
                        // mirror the unbatched wiring above: the bucket runs after the root
                        // job, and the actual job waits for the bucket
                        currentInBucket.addParent(oozieRootJob);
                        oozieActualJob.addParent(currentInBucket);
                        if (!this.jobs.contains(currentInBucket)) {
                            this.jobs.add(currentInBucket);
                        }
                    }
                } else {
                    AbstractJob abstractProvisionOutJob = new BashJob("pfo");
                    abstractProvisionOutJob.getFiles().add(file);
                    OozieProvisionFileJob oozieProvisionOutJob = new OozieProvisionFileJob(abstractProvisionOutJob, file, JOB_PREFIX
                            + workflowRunAccession + "_" + abstractProvisionOutJob.getAlgo() + "_" + jobs.size(),
                            this.uniqueWorkingDir, this.useSge, this.seqwareJar, this.threadsSgeParamFormat,
                            this.maxMemorySgeParamFormat, this.stringTruncator);
                    oozieProvisionOutJob.setMetadataWriteback(metadatawriteback);
                    oozieProvisionOutJob.setMetadataOutputPrefix(wfdm.getMetadata_output_file_prefix());
                    oozieProvisionOutJob.setOutputDir(wfdm.getMetadata_output_dir());
                    if (workflowRunAccession != null && !workflowRunAccession.isEmpty()) {
                        oozieProvisionOutJob.setWorkflowRunAccession(workflowRunAccession);
                    }
                    // SEQWARE-1804 transfer setParentAccessions information ala Pegasus version in
                    // net.sourceforge.seqware.pipeline.workflowV2.engine.pegasus.object.Adag
                    if (!file.getParentAccessions().isEmpty()) {
                        oozieProvisionOutJob.setParentAccessions(file.getParentAccessions());
                    }
                    if (!useOutputBatches) {
                        // hook up the graph
                        oozieProvisionOutJob.addParent(oozieActualJob);
                        this.jobs.add(oozieProvisionOutJob);
                    } else {
                        assert outputBucketGenerator != null;
                        BatchedOozieProvisionFileJob currentOutBucket = outputBucketGenerator
                                .attachAndIterateBuckets(oozieProvisionOutJob);
                        currentOutBucket.addParent(oozieActualJob);
                        if (!this.jobs.contains(currentOutBucket)) {
                            this.jobs.add(currentOutBucket);
                        }
                    }
                }
            }
        }
    }

    private static int countInputFiles(Collection<SqwFile> files) {
        int count = 0;
        for (SqwFile file : files) {
            if (file.isInput()) {
                count++;
            }
        }
        return count;
    }

    /**
     * Encapsulates the logic for generating and iterating over buckets.
     */
    private class BucketGenerator {
        // advanced by BUCKET_SIZE each time a bucket is created; also used as the
        // numeric suffix in bucket names (b0, b100, b200, ...)
        private int currentBucketCount = 0;
        private BatchedOozieProvisionFileJob currentBucket;
        private final boolean input;
        private final String uniqueName;
        private final String workflowRunAccession;

        public BucketGenerator(boolean input, String uniqueName, String workflowRunAccession) {
            this.input = input;
            this.uniqueName = uniqueName;
            this.workflowRunAccession = workflowRunAccession;
            currentBucket = createBucket();
        }

        /**
         * Creates a new bucket; the caller is responsible for adding it to the list of jobs.
         *
         * @return
         */
        private BatchedOozieProvisionFileJob createBucket() {
            String name = JOB_PREFIX + workflowRunAccession + "_pf" + (input ? "i" : "o") + "_" + uniqueName + "_b"
                    + currentBucketCount;
            currentBucketCount += BUCKET_SIZE;
            AbstractJob abstractBucketJob = new BashJob(name);
            currentBucket = new BatchedOozieProvisionFileJob(abstractBucketJob, name, WorkflowApp.this.uniqueWorkingDir,
                    WorkflowApp.this.useSge, WorkflowApp.this.seqwareJar, WorkflowApp.this.threadsSgeParamFormat,
                    WorkflowApp.this.maxMemorySgeParamFormat, WorkflowApp.this.stringTruncator);
            return getCurrentBucket();
        }

        /**
         * Creates new buckets as needed when provision jobs are added.
         *
         * @param provisionJob
         * @return the bucket the job was attached to
         */
        public BatchedOozieProvisionFileJob attachAndIterateBuckets(OozieProvisionFileJob provisionJob) {
            if (getCurrentBucket().getBatchSize() == BUCKET_SIZE) {
                currentBucket = createBucket();
            }
            getCurrentBucket().attachProvisionFileJob(provisionJob);
            return getCurrentBucket();
        }

        /**
         * @return the currentBucket
         */
        public BatchedOozieProvisionFileJob getCurrentBucket() {
            return currentBucket;
        }

        public int getBatchedJobsCount() {
            // currentBucketCount is advanced by BUCKET_SIZE whenever a bucket is created,
            // so subtract the capacity of the still-open bucket before adding its actual size
            return currentBucketCount - BUCKET_SIZE + currentBucket.getBatchSize();
        }
    }
}