// Copyright 2016 Twitter. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package com.twitter.heron.scheduler.slurm;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import com.twitter.heron.spi.utils.ShellUtils;
/**
 * Submits and cancels Heron topology jobs on a Slurm cluster by invoking the
 * {@code sbatch} and {@code scancel} command-line tools.
 *
 * <p>The process-launching and file-reading steps are exposed as {@code protected}
 * methods so that unit tests can override them.
 */
public class SlurmController {
  private static final Logger LOG = Logger.getLogger(SlurmController.class.getName());

  // Placeholder substituted for null/blank executor arguments so that the argument
  // position is preserved when the command line is handed to the batch script.
  private static final String EMPTY_ARG = "\"\"";

  // NOTE(review): stored by the constructor but not read by any method in this class;
  // it is not forwarded to ShellUtils.runSyncProcess either. Confirm whether it should be.
  private final boolean isVerbose;

  SlurmController(boolean isVerbose) {
    this.isVerbose = isVerbose;
  }

  /**
   * Create a slurm job. Uses the slurm scheduler's sbatch command to submit the job.
   * sbatch allocates the nodes and runs the script specified by slurmScript.
   * This script runs the heron executor on each of the nodes allocated.
   *
   * @param slurmScript slurm bash script to execute
   * @param heronExec the heron executable
   * @param commandArgs arguments to the heron executor; {@code null} is treated as no
   *     arguments, and null or blank entries are replaced by a literal {@code ""}
   * @param topologyWorkingDirectory working directory, or {@code null} to use the
   *     current process's working directory
   * @param containers number of containers required to run the topology
   * @param partition the queue to submit the job to, or {@code null} to omit {@code -p}
   * @return true if the job creation is successful
   */
  public boolean createJob(String slurmScript, String heronExec,
                           String[] commandArgs, String topologyWorkingDirectory,
                           long containers, String partition) {
    // get the command to run the job on Slurm cluster
    List<String> slurmCmd = slurmCommand(slurmScript, heronExec, containers, partition);

    // Replace empty arguments with "", because the batch script does not
    // recognize an empty or whitespace-only string as an argument.
    if (commandArgs != null) {
      for (String arg : commandArgs) {
        slurmCmd.add(arg == null || arg.trim().isEmpty() ? EMPTY_ARG : arg);
      }
    }

    String[] slurmCmdArray = slurmCmd.toArray(new String[0]);
    // Both values are passed as parameters: the message needs explicit placeholders for
    // them to be printed, and keeping the directory out of the format string prevents
    // braces or quotes in the path from being interpreted as format syntax.
    LOG.log(Level.INFO, "Executing job [{0}]: {1}",
        new Object[] {topologyWorkingDirectory, Arrays.toString(slurmCmdArray)});

    StringBuilder stderr = new StringBuilder();
    boolean ret = runProcess(topologyWorkingDirectory, slurmCmdArray, stderr);
    if (!ret) {
      // Surface whatever the process reported instead of silently dropping it.
      LOG.log(Level.SEVERE, "Failed to submit the Slurm job: {0}", stderr.toString());
    }
    return ret;
  }

  /**
   * Construct the SLURM command.
   *
   * <p>The resulting list has the form
   * {@code sbatch -N <containers> --ntasks=<containers> [-p <partition>] <slurmScript> <heronExec>}.
   *
   * @param slurmScript slurm script name
   * @param heronExec heron executable name
   * @param containers number of containers
   * @param partition the partition to submit the job to; {@code -p} is omitted when null
   * @return mutable list with the command, to which the caller may append arguments
   */
  private List<String> slurmCommand(String slurmScript, String heronExec,
                                    long containers, String partition) {
    // Long.toString is locale-independent, so -N and --ntasks always render identically.
    String containerCount = Long.toString(containers);

    List<String> slurmCmd = new ArrayList<>();
    slurmCmd.add("sbatch");
    slurmCmd.add("-N");
    slurmCmd.add(containerCount);
    slurmCmd.add("--ntasks=" + containerCount);
    if (partition != null) {
      slurmCmd.add("-p");
      slurmCmd.add(partition);
    }
    slurmCmd.add(slurmScript);
    slurmCmd.add(heronExec);
    return slurmCmd;
  }

  /**
   * Create a slurm job without specifying a partition. Uses the slurm scheduler's
   * sbatch command to submit the job.
   * sbatch allocates the nodes and runs the script specified by slurmScript.
   * This script runs the heron executor on each of the nodes allocated.
   *
   * @param slurmScript slurm bash script to execute
   * @param heronExec the heron executable
   * @param commandArgs arguments to the heron executor
   * @param topologyWorkingDirectory working directory
   * @param containers number of containers required to run the topology
   * @return true if the job creation is successful
   */
  public boolean createJob(String slurmScript, String heronExec,
                           String[] commandArgs, String topologyWorkingDirectory,
                           long containers) {
    return createJob(slurmScript, heronExec, commandArgs,
        topologyWorkingDirectory, containers, null);
  }

  /**
   * Run the given command synchronously through {@code ShellUtils.runSyncProcess}.
   * Kept as a separate protected method so that unit tests can override it.
   *
   * @param topologyWorkingDirectory directory to run the command in, or {@code null}
   *     to pass no working directory
   * @param slurmCmd the command and its arguments
   * @param stderr buffer handed to {@code ShellUtils.runSyncProcess} to collect output
   * @return true if {@code ShellUtils.runSyncProcess} returned 0
   */
  protected boolean runProcess(String topologyWorkingDirectory, String[] slurmCmd,
                               StringBuilder stderr) {
    File file = topologyWorkingDirectory == null ? null : new File(topologyWorkingDirectory);
    // NOTE(review): the meaning of the leading 'false' flag is defined by ShellUtils,
    // which is not visible here -- confirm before changing it.
    return 0 == ShellUtils.runSyncProcess(false, slurmCmd, stderr, file);
  }

  /**
   * Cancel the Slurm job by reading the jobid from the jobIdFile. Uses the scancel
   * command to cancel the job. The file contains a single line with the job id.
   * This file is written by the slurm job script after the job is allocated.
   *
   * @param jobIdFile the jobId file
   * @return true if the job is cancelled successfully; false if the job id could not
   *     be read or the scancel command failed
   */
  public boolean killJob(String jobIdFile) {
    List<String> jobIdFileContent = readFromFile(jobIdFile);

    // Only the first line is used; trim it so trailing whitespace or a stray '\r'
    // does not end up in the scancel argument.
    String jobId = null;
    if (jobIdFileContent != null && !jobIdFileContent.isEmpty()) {
      String firstLine = jobIdFileContent.get(0);
      jobId = firstLine == null ? null : firstLine.trim();
    }

    if (jobId == null || jobId.isEmpty()) {
      LOG.log(Level.SEVERE, "Failed to read the Slurm Job id from file: {0}", jobIdFile);
      return false;
    }

    String[] slurmCmd = new String[]{"scancel", jobId};
    return runProcess(null, slurmCmd, new StringBuilder());
  }

  /**
   * Read all the data from a text file line by line.
   * For now lets keep this util function here. We need to move it to a util location.
   *
   * @param filename name of the file
   * @return string list containing the lines of the file; an empty list if the
   *     filename is null or the file could not be read
   */
  protected List<String> readFromFile(String filename) {
    List<String> result = new ArrayList<>();
    if (filename == null) {
      // Honor the "empty list on failure" contract instead of throwing.
      LOG.log(Level.SEVERE, "Failed to read from file. No file name given");
      return result;
    }

    Path path = new File(filename).toPath();
    try {
      result.addAll(Files.readAllLines(path));
    } catch (IOException e) {
      LOG.log(Level.SEVERE, "Failed to read from file. ", e);
    }
    return result;
  }
}