package org.molgenis.compute.host; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.UUID; import org.molgenis.util.SimpleTuple; import org.molgenis.util.SshResult; import org.testng.log4testng.Logger; public class Pbs extends AbstractComputeHost implements ComputeHost { private Logger logger = Logger.getLogger(Pbs.class); /** * Construct a PBS connector over ssh using host, user, password, and port * * @param host * @param user * @param password * @param port * @throws IOException */ public Pbs(String host, String user, String password, int port) throws IOException { super(host, user, password, port); } public Pbs(String host, String user, String password) throws IOException { super(host, user, password, 22); } /** Submit a configure job with manually entered settings; returns the job */ @Override public void submit(Job job) throws IOException { String path = getWorkingDir() + ("".equals(this.getWorkingDir()) ? "" : "/") + job.getName(); // set error paths relative to job dir job.setOutput_path(path + ".out"); job.setError_path(path + ".err"); // set defaults if (job.getWalltime() == null) job.setWalltime("00:30:00"); if (job.getMem() == null) job.setMem("2gb"); if (job.getNodes() == null) job.setNodes("1:ppn=1"); // create the PBS headers String script = "#!/bin/bash\n"; // if no name set, create one if (job.getName() == null) job.setName(UUID.randomUUID().toString().replace("-", "")); script += "#PBS -N " + job.getName() + "\n"; if (job.getNodes() != null) script += "#PBS -l nodes=" + job.getNodes() + "\n"; if (job.getQueue() != null) script += "#PBS -q " + job.getQueue() + "\n"; if (job.getWalltime() != null) script += "#PBS -l walltime=" + job.getWalltime() + "\n"; if (job.getOutput_path() != null) script += "#PBS -o " + job.getOutput_path() + "\n"; if (job.getError_path() != null) script += "#PBS -e " + job.getError_path() + "\n"; script += "\n\n"; script += job.getScript(); logger.debug("submitting script:\n" + script); // random filename String filename = job.getName() + ".sh"; logger.debug("uploading script as file: " + filename); this.uploadStringToFile(script, filename, getWorkingDir()); // start the script SshResult result = this.executeCommand("qsub " + path + ".sh"); // check for errors in submission if (result.getStdErr() != null && !result.getStdErr().trim().equals("")) throw new IOException( result.getStdErr()); // return the id (e.g. for dependencies) String id = result.getStdOut().trim(); job.setId(id); this.jobs.put(id, job); } /** * Remove the job remotely and remove from local list of jobs. * * @param job * @throws IOException */ @Override public void remove(Job job) throws IOException { // kill on cluster this.executeCommand("qdel " + job.getId()); // remove job from 'jobs' this.jobs.remove(job.getName()); } @Override public void refresh(Job job) throws IOException { if (JobState.COMPLETED != job.getState()) try { // retrieve the state SshResult pbsOutput = executeCommand("qstat -f -1 " + job.getId()); if (pbsOutput.getStdErr() != null && !pbsOutput.getStdErr().trim().equals("")) { if (pbsOutput.getStdErr().contains("Unknown Job Id")) { job.setState(JobState.WAITING_FOR_LOGS); } } else { this.parse(job, pbsOutput.getStdOut()); } // try retrieve logs if (JobState.WAITING_FOR_LOGS == job.getState()) { Thread.sleep(500); // retrieve the output log, we need to strip host information // (i.e. stuff before ":") job.setOutput_log(downloadFile(job.getOutput_path() .substring(job.getOutput_path().lastIndexOf(":") + 1))); Thread.sleep(500); // retrieve the error log, we need to strip host information // (i.e. stuff before ":") job.setError_log(downloadFile(job.getError_path().substring(job.getError_path().lastIndexOf(":") + 1))); job.setState(JobState.COMPLETED); } } catch (Exception e) { e.printStackTrace(); } } // protected void refresh(String statusString) // { // // default state to error // this.state = JobState.ERROR; // // // parse the string // String[] result = statusString.split("\n"); // for (String res : result) // { // if (res.contains("job_state")) // { // String job_state = res.split("=")[1].trim(); // this.state = JobState.fromString(job_state); // } // // if (res.contains("exec_host")) // { // exec_host = res.split("=")[1].trim(); // } // // if (res.contains("qtime")) // { // qtime = res.split("=")[1].trim(); // } // // } // } public List<Job> getQstat() throws IOException { // retrieve the state SshResult pbsOutput = executeCommand("qstat -f -1 "); // split the log in string per job String[] allLogs = pbsOutput.getStdOut().split("\n\n"); List<Job> result = new ArrayList<Job>(); // set all the properties of the job for (String log : allLogs) { Job job = new Job(); this.parse(job, log); result.add(job); } return result; } private void parse(Job job, String log) { SimpleTuple parse = new SimpleTuple(); if (log == null || "".equals(log)) return; logger.debug("parsing log: " + log); String[] keyValuePairs = log.split("\n"); for (String keyValue : keyValuePairs) { keyValue = keyValue.trim(); String[] split = null; if (keyValue.contains("=")) split = keyValue.split("="); else split = keyValue.split(":"); String key = split[0].trim(); String value = split[1].trim(); parse.set(key, value); } job.setId(parse.getString("Job Id")); job.setName(parse.getString("Job_Name")); job.setState(JobState.fromString(parse.getString("job_state"))); job.setQueue(parse.getString("queue")); // job.setOwner(parse.getString("Job_Owner")); job.setError_path(parse.getString("Error_Path")); job.setOutput_path(parse.getString("Output_Path")); job.setNodes(parse.getString("Resource_List.nodes")); job.setWalltime(parse.getString("Resource_List.walltime")); job.setExec_host(parse.getString("exec_host")); // Job Id: 1910636.millipede.cm.cluster // Job_Name = wikiassoc-en // Job_Owner = s1254871@login01.cm.cluster // job_state = Q // queue = short // server = millipede.cm.cluster // Checkpoint = u // ctime = Mon Jul 4 19:08:33 2011 // Error_Path = login01.cm.cluster:/home/s1254871/wikiassoc-en.e1910636 // Hold_Types = n // Join_Path = n // Keep_Files = n // Mail_Points = abe // Mail_Users = larsmans@gmail.com // mtime = Mon Jul 4 19:08:33 2011 // Output_Path = login01.cm.cluster:/home/s1254871/wikiassoc-en.o1910636 // Priority = 0 // qtime = Mon Jul 4 19:08:33 2011 // Rerunable = True // Resource_List.nodect = 1 // Resource_List.nodes = 1:ppn=24 // Resource_List.walltime = 00:10:00 // etime = Mon Jul 4 19:08:33 2011 // submit_args = skl-old.job // fault_tolerant = False // submit_host = login01.cm.cluster // init_work_dir = /home/s1254871 } }