package com.vip.saturn.job.utils;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.exec.CommandLine;
import org.apache.commons.exec.DefaultExecutor;
import org.apache.commons.exec.ExecuteWatchdog;
import org.apache.commons.exec.PumpStreamHandler;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.vip.saturn.job.basic.SaturnConstant;
import com.vip.saturn.job.executor.SaturnExecutorsNode;
import com.vip.saturn.job.internal.config.ConfigurationNode;
import com.vip.saturn.job.internal.execution.ExecutionNode;
import com.vip.saturn.job.internal.storage.JobNodePath;
import com.vip.saturn.job.reg.base.CoordinatorRegistryCenter;
/**
* 用于处理Shell的相关pid功能
*
* @author linzhaoming
*
*/
public class ScriptPidUtils {
static Logger log = LoggerFactory.getLogger(ScriptPidUtils.class);
public static long UNKNOWN_PID = -1;
/** 系统分隔符 */
protected static final String FILESEPARATOR = System.getProperty("file.separator");
/**
* Saturn的运行目录
* <p>${HOME}/.saturn/executing
*/
public static final String EXECUTINGPATH = System.getProperty("user.home") + FILESEPARATOR + ".saturn" + FILESEPARATOR + "executing";
/**
* Saturn的运行目录
* <p>${HOME}/.saturn/output
*/
public static final String OUTPUT_PATH = System.getProperty("user.home") + FILESEPARATOR + ".saturn" + FILESEPARATOR + "output";
/** 作业执行的运行目录
* <p>目录: ${HOME}/.saturn/executing/[executorName]/[jobName]
* */
public static final String EXECUTINGJOBPATH = EXECUTINGPATH + FILESEPARATOR + "%s" + FILESEPARATOR + "%s";
/** 作业执行的Pid文件
* <p>目录: ${HOME}/.saturn/executing/[executorName]/[jobName]/[jobItem]/PID
* */
public static final String JOBITEMPIDSPATH = EXECUTINGJOBPATH + FILESEPARATOR + "%s" + FILESEPARATOR + "PIDS";
public static final String JOBITEMPATH = EXECUTINGJOBPATH + FILESEPARATOR + "%s";
public static final String JOBITEMPIDPATH2 = EXECUTINGJOBPATH + FILESEPARATOR + "%s" + FILESEPARATOR + "PIDS" + FILESEPARATOR +"%s";
/** Shell作业执行的回写结果路径文件
* <p>目录: ${HOME}/.saturn/output/[executorName]/[jobName]/[jobItem]/[randomId/messageId]/[timestamp]
* */
public static final String JOBITEMOUTPUTPATH = OUTPUT_PATH + FILESEPARATOR + "%s" + FILESEPARATOR + "%s" + FILESEPARATOR + "%s" + FILESEPARATOR + "%s" + FILESEPARATOR + "%s";
private static final String CHECK_RUNNING_JOB_THREAD_NAME = "check-if-job-%s-done";
/**
* 获取当前Saturn的执行目录(executing)
* @return Saturn的执行目录
*/
public static File getSaturnExecutingHome() {
File executingHome = new File(EXECUTINGPATH);
try {
FileUtils.forceMkdir(executingHome);
} catch (Exception ex) {
log.error("msg=Creating directory error", ex);
}
if (executingHome.exists() && executingHome.isDirectory()) {
return executingHome;
} else {
return null;
}
}
/**
* 写入对应的作业分片的pid文件
* @param executorName Executor name
* @param jobName 作业名字
* @param jobItem 作业分片
* @param pid 进程pid
*/
public static void writePidToFile(String executorName, String jobName, int jobItem, long pid) {
String dir = String.format(JOBITEMPIDSPATH, executorName, jobName, jobItem);
String path = String.format(JOBITEMPIDPATH2, executorName, jobName, jobItem, pid);
try {
FileUtils.forceMkdir(new File(dir));
File itemFile = new File(path);
FileUtils.writeStringToFile(itemFile, String.valueOf(pid));
} catch (IOException e) {
log.error(String.format(SaturnConstant.ERROR_LOG_FORMAT, jobName, "Writing the pid file error"), e);
}
}
/**
* 仅用于兼容旧版,获取 PID
* @param executorName
* @param jobName
* @param jobItem
* @return
*/
@Deprecated
public static long _getPidFromFile(String executorName, String jobName, String jobItem) {
String path = String.format(JOBITEMPATH, executorName, jobName, jobItem);
File itemFile = new File(path);
if (!itemFile.exists() || !itemFile.isFile()) {
return UNKNOWN_PID;
}
try {
String pid = FileUtils.readFileToString(itemFile);
try {
return Long.parseLong(pid);
} catch (NumberFormatException e) {
log.error(String.format(SaturnConstant.ERROR_LOG_FORMAT, jobName, "Parsing the pid file error"), e);
return UNKNOWN_PID;
}
} catch (IOException e) {
log.error(String.format(SaturnConstant.ERROR_LOG_FORMAT, jobName, "Reading the pid file error"), e);
return UNKNOWN_PID;
}
}
public static long getFirstPidFromFile(String executorName, String jobName, String jobItem) {
List<Long> pids = getPidsFromFile(executorName, jobName, jobItem);
if(pids.isEmpty()){
return UNKNOWN_PID;
}
return pids.get(0);
}
/**
* 获取对应作业分片的pid, -1表示不存在或读取出错
* @param executorName Executor Name
* @param jobName 作业名
* @param jobItem 作业分片
* @return pid
*/
public static List<Long> getPidsFromFile(String executorName, String jobName, String jobItem) {
List<Long> pids = new ArrayList<Long>();
//兼容旧版PID目录
Long pid = _getPidFromFile(executorName, jobName, jobItem);
if(pid > 0){
pids.add(pid);
}
String path = String.format(JOBITEMPIDSPATH, executorName, jobName, jobItem);
File dir = new File(path);
if (!dir.exists() || !dir.isDirectory()) {
return pids;
}
File[] files = dir.listFiles();
if(files == null || files.length == 0){
return pids;
}
for(File file:files){
try {
pids.add(Long.parseLong(file.getName()));
} catch (Exception e) {
log.error(String.format(SaturnConstant.ERROR_LOG_FORMAT, jobName, "Parsing the pid file error"), e);
}
}
return pids;
}
/**
* 获取对应作业的分片pid文件列表
* @param executorName Executor Name
* @param jobName 作业名
* @return pid文件列表
*/
public static String[] getItemsPaths(String executorName, String jobName) {
String jobNamePath = String.format(EXECUTINGJOBPATH, executorName, jobName);
File jobNameFile = new File(jobNamePath);
if (!jobNameFile.exists() || jobNameFile.isFile()) {
return new String[0];
}
File[] files = jobNameFile.listFiles();
if(files == null || files.length == 0){
return new String[]{};
}
String[] filePaths = new String[files.length];
int i=0;
for(File file:files){
filePaths[i++] = file.getAbsolutePath();
}
return filePaths;
}
/**
* 删除作业分片的全部pid文件
* @param executorName Executor Name
* @param jobName 作业名
* @param jobItem 作业分片
* @return 删除是否成功
*/
public static boolean removeAllPidFile(String executorName, String jobName, String jobItem) {
String path = String.format(JOBITEMPATH, executorName, jobName, jobItem);
File itemFile = new File(path);
if (!itemFile.exists()) {
return false;
}
try {
FileUtils.forceDelete(itemFile);
} catch (IOException e) {
log.error(e.getMessage(),e);
}
return true;
}
public static boolean removePidFile(String executorName, String jobName, String jobItem , long pid) {
String path = String.format(JOBITEMPIDPATH2, executorName, jobName, jobItem, pid);
File itemFile = new File(path);
if (!itemFile.exists()) {
return false;
}
try {
FileUtils.forceDelete(itemFile);
} catch (IOException e) {
log.error(e.getMessage(),e);
}
return true;
}
public static boolean removeAllPidFile(String executorName, String jobName, int jobItem) {
return removeAllPidFile(executorName, jobName, ""+Integer.toString(jobItem) );
}
/**
* This method will kill all the child/grandchild/... processes.
* @param pid pid to kill.
*/
public static void killAllChildrenByPid(long pid, boolean force) throws InterruptedException {
if(pid <= UNKNOWN_PID){
return;
}
String pidStr = Long.toString(pid) + "";
List<String> pidList = new ArrayList<>();
pidList.add(pidStr);
while ( null != (pidStr = exeCmdWithoutPipe(CommandLine.parse("pgrep -P " + pidStr), null, null))) {
String[] pids = pidStr.split(System.getProperty("line.separator"));
for (int i = 0; i < pids.length; i++) {
pidList.add(pids[i]);//NOSONAR
}
pidStr = StringUtils.join(pids,",");
}
// make sure kill the son before kill the parent.
for (int i = pidList.size() - 1; i >=0; i--) {
String ppid = pidList.get(i);
if(!isPidRunning(ppid)){
continue;
}
if (force) {
exeWholeCmd("kill -9 " +ppid);
} else {
exeWholeCmd("kill " + ppid );
}
}
}
public static String exeWholeCmd(String cmd) {
// Common apache exec doesn't support piple operation.
// It's the shell (e.g. bash) that interprets the pipe and does special processing when you type that commandline into the shell.
// But we could use a ByteArrayInputStream to feed the outuput of one command to another.
if (cmd.contains("|")) {
String[] cmds = cmd.split("\\|");
String out = null;
for (int i = 0; i < cmds.length; i++) {
CommandLine cmdLine = CommandLine.parse(cmds[i]);
if (i == 0) {
out = exeCmdWithoutPipe(cmdLine, null, loadEnv());
}
if(out != null){
out = exeCmdWithoutPipe(cmdLine, new ByteArrayInputStream(out.getBytes(Charset.forName("utf-8"))), loadEnv());
}
}
return out;
} else {
CommandLine cmdLine = CommandLine.parse(cmd);
return exeCmdWithoutPipe(cmdLine, null, loadEnv());
}
}
public static String exeCmdWithoutPipe(CommandLine cmdLine, ByteArrayInputStream input, Map<String,String> env) {
DefaultExecutor executor = new DefaultExecutor();
ExecuteWatchdog dog = new ExecuteWatchdog(3 * 1000);
executor.setWatchdog(dog);
executor.setExitValue(0);
try {
ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
ByteArrayOutputStream errorStream = new ByteArrayOutputStream();
PumpStreamHandler streamHandler = new PumpStreamHandler(outputStream,errorStream,input);
executor.setStreamHandler(streamHandler);
int value = executor.execute(cmdLine, env);
if (value == 0) {
String out = outputStream.toString();
return out;
} else {
return null;
}
} catch (Exception e) {
log.error("msg=" + e.getMessage(), e);
return null;
}
}
/**
* 检查是否已经已有作业名重复运行
* @param zkJobNames 作业名列表
*/
public static void checkAllExistJobs(final CoordinatorRegistryCenter regCenter, final List<String> zkJobNames) {
if (zkJobNames == null || zkJobNames.size() == 0) {
return;
}
for (final String jobName : zkJobNames) {
checkOneExistJob(regCenter, jobName);
}
}
public static void forceStopRunningShellJob(final String executorName, final String jobName) {
String[] pidFromFile = ScriptPidUtils.getItemsPaths(executorName, jobName);
if(pidFromFile == null || pidFromFile.length == 0){
log.info("[{}] msg={} no pids to kill", jobName, jobName);
return;
}
for (String path : pidFromFile) {
String itemStr = StringUtils.substringAfterLast(path, File.separator);
int jobItem = Integer.parseInt(itemStr);
List<Long> pids = ScriptPidUtils.getPidsFromFile(executorName, jobName, ""+Integer.toString(jobItem) );
for(Long pid : pids){
if (pid > 0 && ScriptPidUtils.isPidRunning(""+pid)) {
try {
ScriptPidUtils.killAllChildrenByPid(pid, true);
} catch (InterruptedException e) {
log.error(String.format(SaturnConstant.ERROR_LOG_FORMAT, jobName, e.getMessage()), e);
}
}
}
ScriptPidUtils.removeAllPidFile(executorName,jobName, jobItem);
}
}
public static void checkOneExistJob(final CoordinatorRegistryCenter regCenter, final String jobName) {
final String executorName = regCenter.getExecutorName();
String[] itemPaths = ScriptPidUtils.getItemsPaths(executorName, jobName);
if(itemPaths == null || itemPaths.length == 0){
return;
}
String jobTypePath = JobNodePath.getNodeFullPath(jobName, ConfigurationNode.JOB_TYPE);
String jobType = regCenter.get(jobTypePath);
if(!"SHELL_JOB".equals(jobType)){
log.info("{} is not shell job ,igore checking ", jobName);
return;
}
String enabledPath = JobNodePath.getNodeFullPath(jobName, ConfigurationNode.ENABLED);
String isEnabledStr = regCenter.get(enabledPath);
log.info("[{}] msg={} pidFromFile size :{};isEnabledStr:{}",jobName,jobName,itemPaths.length,isEnabledStr);
// null means new job, if there are pid files, kill -9.
// if it's true, means it's an enabled job, there shouldn't exist the pid files. kill them with no mercy.
if ("true".equals(isEnabledStr) || isEnabledStr == null) {
killRunningShellProcess(executorName, jobName, itemPaths);
}
else{
// if there are other executors, failover will occure. This executor only has to kill the pids.
if (areThereOtherExecutorsRunningTheShards(regCenter, jobName)) {
killRunningShellProcess(executorName, jobName, itemPaths);
} else {
// enabled job with pid files existed and no other executors, means that the job is exited improperly.
// under this situation, we need to restore the running job status.
final List<String> shardItems = new ArrayList();
for (String path : itemPaths) {
String itemStr = StringUtils.substringAfterLast(path, File.separator);
int jobItem = Integer.parseInt(itemStr);
long pid = ScriptPidUtils.getFirstPidFromFile(executorName, jobName,""+ Integer.toString(jobItem));
if (pid > 0 && ScriptPidUtils.isPidRunning(""+Long.toString(pid))) {
String runningPath = JobNodePath.getNodeFullPath(jobName, String.format(ExecutionNode.RUNNING, Integer.parseInt(itemStr)));
regCenter.persistEphemeral(runningPath, "");
log.info("[{}] msg={}-{} restores running status, path={}", jobName, jobName, jobName, path, runningPath);
System.out.println(jobName + "-" + path+ " restores running status, path=" + runningPath);//NOSONAR
shardItems.add(itemStr);
log.info("[{}] msg={}-{} is running, pid={}", jobName, jobName, path, pid);
}else{
ScriptPidUtils.removeAllPidFile(executorName, jobName, itemStr);
log.info("[{}] msg={}-{} is not running, pid={}", jobName, jobName, path, pid);
}
}
if(shardItems.isEmpty()){
return;
}
// start a thread to check if shell process is done, if yes, remove pid file -> add completed -> clear running
new Thread(new Runnable() {
@Override
public void run() {
while (!Thread.interrupted()) {
try {
TimeUnit.MILLISECONDS.sleep(500);
} catch (InterruptedException e) {
}
boolean finished = true;
for(String shardItem:shardItems){
long pid = ScriptPidUtils.getFirstPidFromFile(executorName, jobName, shardItem);
if (pid > 0 && ScriptPidUtils.isPidRunning(""+Long.toString(pid))) {
finished = false;
continue;
} else {
// remove pid file -> add completed -> clear running
// make sure u have added completed node before remove running node. otherwise failover will triggered.
ScriptPidUtils.removeAllPidFile(executorName, jobName, shardItem);
String completedPath = JobNodePath.getNodeFullPath(jobName, String.format(ExecutionNode.COMPLETED, shardItem));
regCenter.persist(completedPath, "");
String runningPath = JobNodePath.getNodeFullPath(jobName, String.format(ExecutionNode.RUNNING, shardItem));
regCenter.remove(runningPath);
log.info("[{}] msg={} - {} is done, write complete node path {}", jobName, jobName, shardItem,completedPath);
System.out.println(jobName + "-" + shardItem + " is done.");//NOSONAR
}
}
if (finished) {
log.info("[{}] msg=all running shell processes are done. now quit the thread.");
System.out.println("all running shell processes are done. now quit the thread.");//NOSONAR
break;
}
}
}
}, String.format(CHECK_RUNNING_JOB_THREAD_NAME, jobName)).start();
}
}
}
private static void killRunningShellProcess(String executorName, String jobName, String[] pidFromFile) {
for (String path : pidFromFile) {
Integer item = Integer.parseInt(StringUtils.substringAfterLast(path, File.separator));
long pid = ScriptPidUtils.getFirstPidFromFile(executorName, jobName, ""+item);
System.out.println("pid found for jobName:" + jobName + " executorName:" + executorName+ ", kill -9 " + pid);//NOSONAR
try {
killAllChildrenByPid(pid, true);
} catch (InterruptedException e) {
log.error(String.format(SaturnConstant.ERROR_LOG_FORMAT, jobName, "killRunningShellProcess interrupted:"), e);
}
ScriptPidUtils.removeAllPidFile(executorName, jobName, item);
}
}
private static boolean areThereOtherExecutorsRunningTheShards(final CoordinatorRegistryCenter regCenter, String jobName) {
final String executorName = regCenter.getExecutorName();
List<String> executors = regCenter.getChildrenKeys(SaturnExecutorsNode.SATURN_EXECUTORS_EXECUTORS_NODE_NAME);
if (executors != null && !executors.isEmpty()) {
for (String executor : executors) {
if (!executorName.equals(executor)) {
// check if this executor has taken care of the failovers.
String sharding = regCenter.get(JobNodePath.getNodeFullPath(jobName, executor + "/sharding"));
if (StringUtils.isNoneBlank(sharding)) {
return true;
}
}
}
}
return false;
}
/* public static boolean isPidRunning(long pid) {
CommandLine cmdLine = CommandLine.parse(String.format(CHECK_PID_CMD, pid));
String outPut = exeCmdWithoutPipe(cmdLine, null, null);
if (StringUtils.isBlank(outPut)) {
return false;
}
return true;
}*/
public static boolean isPidRunning(long pid) {
String path = "/proc/" + pid;
return new File(path).exists();
}
public static boolean isPidRunning(String pid) {
String path = "/proc/" + pid;
return new File(path).exists();
}
public static Map<String, String> parseString2Map(String source) {
Map<String,String> map = new HashMap<>();
String[] lines = source.split(System.getProperty("line.separator"));
for (String oneLine: lines) {
String[] kvs = oneLine.split("=");
if (kvs.length == 2) {
map.put(kvs[0], kvs[1]);
} else if (kvs.length > 2) {
map.put(kvs[0], oneLine.replace(kvs[0] + "=", ""));
}
}
return map;
}
public static Map<String,String> loadEnv() {
Map<String, String> env = new HashMap<>();
try {
final CommandLine cmdLine = new CommandLine("/bin/sh");
cmdLine.addArguments(new String[]{"-c","source /etc/profile && env"}, false);
String output = exeCmdWithoutPipe(cmdLine, null, null);
if(output == null) {
return env;
}
env = parseString2Map(output);
} catch (Exception e) {
log.error("msg=" + e.getMessage(),e);
}
return env;
}
public static String filterEnvInCmdStr(Map<String,String> env, String cmd) {
String patternString = "\\$\\{?(" + StringUtils.join(env.keySet(),"|") + ")\\}?";
Pattern pattern = Pattern.compile(patternString);
Matcher matcher = pattern.matcher(cmd);
StringBuffer sb = new StringBuffer();//NOSONAR
while(matcher.find()) {
matcher.appendReplacement(sb, env.get(matcher.group(1)));
}
matcher.appendTail(sb);
return sb.toString();
}
}