package com.vip.saturn.job.console.service.impl; import java.nio.charset.Charset; import java.text.SimpleDateFormat; import java.util.*; import java.util.Map.Entry; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.ThreadFactory; import java.util.concurrent.TimeUnit; import javax.annotation.PostConstruct; import com.vip.saturn.job.console.utils.*; import org.apache.commons.lang3.StringUtils; import org.apache.curator.framework.CuratorFramework; import org.apache.zookeeper.KeeperException.NoNodeException; import org.apache.zookeeper.data.Stat; import org.slf4j.Logger; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Service; import com.alibaba.fastjson.JSON; import com.alibaba.fastjson.TypeReference; import com.google.common.base.Strings; import com.vip.saturn.job.console.domain.AbnormalContainer; import com.vip.saturn.job.console.domain.AbnormalJob; import com.vip.saturn.job.console.domain.AbnormalShardingState; import com.vip.saturn.job.console.domain.DomainStatistics; import com.vip.saturn.job.console.domain.ExecutorStatistics; import com.vip.saturn.job.console.domain.JobBriefInfo.JobType; import com.vip.saturn.job.console.domain.JobStatistics; import com.vip.saturn.job.console.domain.RegistryCenterClient; import com.vip.saturn.job.console.domain.RegistryCenterConfiguration; import com.vip.saturn.job.console.domain.Timeout4AlarmJob; import com.vip.saturn.job.console.domain.ZkCluster; import com.vip.saturn.job.console.domain.ZkStatistics; import com.vip.saturn.job.console.domain.container.ContainerConfig; import com.vip.saturn.job.console.domain.container.ContainerScaleJob; import com.vip.saturn.job.console.exception.JobConsoleException; import com.vip.saturn.job.console.mybatis.entity.SaturnStatistics; import com.vip.saturn.job.console.mybatis.service.SaturnStatisticsService; import com.vip.saturn.job.console.repository.zookeeper.CuratorRepository; import com.vip.saturn.job.console.repository.zookeeper.CuratorRepository.CuratorFrameworkOp; import com.vip.saturn.job.console.repository.zookeeper.impl.CuratorRepositoryImpl; import com.vip.saturn.job.console.service.ContainerService; import com.vip.saturn.job.console.service.DashboardService; import com.vip.saturn.job.console.service.JobDimensionService; import com.vip.saturn.job.console.service.RegistryCenterService; import com.vip.saturn.job.console.service.helper.DashboardServiceHelper; import com.vip.saturn.job.console.utils.ConsoleUtil; import com.vip.saturn.job.console.utils.ContainerNodePath; import com.vip.saturn.job.console.utils.ExecutorNodePath; import com.vip.saturn.job.console.utils.JobNodePath; import com.vip.saturn.job.console.utils.ResetCountType; import com.vip.saturn.job.console.utils.SaturnThreadFactory; import com.vip.saturn.job.console.utils.StatisticsTableKeyConstant; /** * @author chembo.huang * */ @Service public class DashboardServiceImpl implements DashboardService { private static final Logger log = org.slf4j.LoggerFactory.getLogger(DashboardServiceImpl.class); public static int REFRESH_INTERVAL_IN_MINUTE = 7; private static int ALLOW_DELAY_MILLIONSECONDS = 60 * 1000 * REFRESH_INTERVAL_IN_MINUTE; public static HashMap<String/** zkBsKey **/, List<AbnormalJob>> UNNORMAL_JOB_LIST_CACHE = new HashMap<>(); public static HashMap<String/** zkBsKey **/, HashMap<String/** {jobname}-{domain} */, JobStatistics>> 
			JOB_MAP_CACHE = new HashMap<>();
	public static HashMap<String/** zkBsKey **/, HashMap<String/** {executorName}-{domain} */, ExecutorStatistics>> EXECUTOR_MAP_CACHE = new HashMap<>();
	public static HashMap<String/** zkBsKey **/, Integer/** docker executor count */> DOCKER_EXECUTOR_COUNT_MAP = new HashMap<>();
	public static HashMap<String/** zkBsKey **/, Integer/** physical executor count */> PHYSICAL_EXECUTOR_COUNT_MAP = new HashMap<>();
	public static Map<String/** domainName_jobName_shardingItemStr **/, AbnormalShardingState/** abnormal sharding state */> ABNORMAL_SHARDING_STATE_CACHE = new ConcurrentHashMap<>();
	private static ScheduledExecutorService abnormalShardingCacheCleaner = Executors.newScheduledThreadPool(1,
			new SaturnThreadFactory("AbnormalSharding-Cache-Cleaner"));

	@Autowired
	private SaturnStatisticsService saturnStatisticsService;
	@Autowired
	private RegistryCenterService registryCenterService;
	@Autowired
	private JobDimensionService jobDimensionService;
	@Autowired
	private CuratorRepository curatorRepository;
	@Autowired
	private ReportAlarmServiceImpl reportAlarmService;
	@Autowired
	private ContainerService containerService;

	@PostConstruct
	public void init() throws Exception {
		if (ConsoleUtil.isDashboardOn()) {
			startRefreshStatisticsTimer();
		}
	}

	static {
		String refreshInterval = System.getProperty("VIP_SATURN_DASHBOARD_REFRESH_INTERVAL_MINUTE",
				System.getenv("VIP_SATURN_DASHBOARD_REFRESH_INTERVAL_MINUTE"));
		if (refreshInterval != null) {
			try {
				REFRESH_INTERVAL_IN_MINUTE = Integer.valueOf(refreshInterval);
				ALLOW_DELAY_MILLIONSECONDS = 60 * 1000 * REFRESH_INTERVAL_IN_MINUTE;
			} catch (Exception e) {
				log.error(e.getMessage(), e);
			}
		}
		// Start the scheduled thread that periodically cleans expired sharding alert cache entries
		abnormalShardingCacheCleaner.scheduleAtFixedRate(new AbnormalShardingCacheCleaner(), 0,
				ALLOW_DELAY_MILLIONSECONDS, TimeUnit.MILLISECONDS);
	}

	private ExecutorService singleThreadExecutor = Executors.newSingleThreadExecutor(new ThreadFactory() {
		@Override
		public Thread newThread(Runnable r) {
			String name = "single-update-statistics";
			Thread t = new Thread(r, name);
			if (t.isDaemon()) {
				t.setDaemon(false);
			}
			if (t.getPriority() != Thread.NORM_PRIORITY) {
				t.setPriority(Thread.NORM_PRIORITY);
			}
			return t;
		}
	});

	private TimerTask refreshStatisticsTask() {
		return new TimerTask() {
			@Override
			public void run() {
				try {
					Date start = new Date();
					log.info("start refresh statistics.");
					refreshStatistics2DB();
					log.info("end refresh statistics, takes " + (new Date().getTime() - start.getTime()));
				} catch (Exception e) {
					log.error(e.getMessage(), e);
				}
			}
		};
	}

	private void startRefreshStatisticsTimer() {
		Timer timer = new Timer("refresh-statistics-to-db-timer", true);
		timer.scheduleAtFixedRate(refreshStatisticsTask(), 1000 * 15, 1000 * 60 * REFRESH_INTERVAL_IN_MINUTE);
	}

	@Override
	public synchronized void refreshStatistics2DB() {
		Collection<ZkCluster> zkClusters = RegistryCenterServiceImpl.ZKADDR_TO_ZKCLUSTER_MAP.values();
		for (ZkCluster zkCluster : zkClusters) {
			HashMap<String/** {jobname}-{domain} */, JobStatistics> jobMap = new HashMap<>();
			HashMap<String/** {executorName}-{domain} */, ExecutorStatistics> executorMap = new HashMap<>();
			List<JobStatistics> jobList = new ArrayList<>();
			List<ExecutorStatistics> executorList = new ArrayList<>();
			List<AbnormalJob> unnormalJobList = new ArrayList<>();
			List<AbnormalJob> unableFailoverJobList = new ArrayList<>();
			List<Timeout4AlarmJob> timeout4AlarmJobList = new ArrayList<>();
			List<DomainStatistics> domainList = new ArrayList<>();
			List<AbnormalContainer> abnormalContainerList = new ArrayList<>();
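			// Walk every namespace registered under this ZK cluster: aggregate job/executor/domain
			// statistics from ZooKeeper, detect abnormal, timeout-for-alarm and unable-to-failover jobs
			// as well as mismatched container resources, then persist the results and refresh the
			// in-memory caches (JOB_MAP_CACHE, EXECUTOR_MAP_CACHE, ...) at the end of this iteration.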
Map<String, Long> versionDomainNumber = new HashMap<>(); // 不同版本的域数量 Map<String, Long> versionExecutorNumber = new HashMap<>(); // 不同版本的executor数量 int exeInDocker = 0; int exeNotInDocker = 0; int totalCount = 0; int errorCount = 0; for (RegistryCenterConfiguration config : RegistryCenterServiceImpl.ZKADDR_TO_ZKCLUSTER_MAP.get(zkCluster.getZkAddr()).getRegCenterConfList()) { // 过滤非当前zk连接 if (zkCluster.getZkAddr().equals(config.getZkAddressList())) { int processCountOfThisDomainAllTime = 0; int errorCountOfThisDomainAllTime = 0; int processCountOfThisDomainThisDay = 0; int errorCountOfThisDomainThisDay = 0; DomainStatistics domain = new DomainStatistics(config.getNamespace(), zkCluster.getZkAddr(), config.getNameAndNamespace()); RegistryCenterClient registryCenterClient = registryCenterService.connect(config.getNameAndNamespace()); try { if (registryCenterClient != null) { CuratorFramework curatorClient = registryCenterClient.getCuratorClient(); CuratorFrameworkOp curatorFrameworkOp = curatorRepository.newCuratorFrameworkOp(curatorClient); // 统计稳定性 if (checkExists(curatorClient, ExecutorNodePath.SHARDING_COUNT_PATH)) { String countStr = getData(curatorClient, ExecutorNodePath.SHARDING_COUNT_PATH); domain.setShardingCount(Integer.valueOf(countStr)); } String version = null; // 该域的版本号 long executorNumber = 0L; // 该域的在线executor数量 // 统计物理容器资源,统计版本数据 if (null != curatorClient.checkExists().forPath(ExecutorNodePath.getExecutorNodePath())) { List<String> executors = curatorClient.getChildren().forPath(ExecutorNodePath.getExecutorNodePath()); if(executors != null) { for (String exe : executors) { // 在线的才统计 if (null != curatorClient.checkExists().forPath(ExecutorNodePath.getExecutorIpNodePath(exe))) { // 统计是物理机还是容器 String executorMapKey = exe + "-" + config.getNamespace(); ExecutorStatistics executorStatistics = executorMap.get(executorMapKey); if (executorStatistics == null) { executorStatistics = new ExecutorStatistics(exe, config.getNamespace()); executorStatistics.setNns(domain.getNns()); executorStatistics.setIp(getData(curatorClient, ExecutorNodePath.getExecutorIpNodePath(exe))); executorMap.put(executorMapKey, executorStatistics); } // set runInDocker field if (checkExists(curatorClient, ExecutorNodePath.get$ExecutorTaskNodePath(exe))) { executorStatistics.setRunInDocker(true); exeInDocker++; } else { exeNotInDocker++; } } // 获取版本号 if (version == null) { version = getData(curatorClient, ExecutorNodePath.getExecutorVersionNodePath(exe)); } } executorNumber = executors.size(); } } // 统计版本数据 if(version == null) { // 未知版本 version = "-1"; } if(versionDomainNumber.containsKey(version)) { Long domainNumber = versionDomainNumber.get(version); versionDomainNumber.put(version, domainNumber + 1); } else { versionDomainNumber.put(version, 1L); } if(versionExecutorNumber.containsKey(version)) { Long executorNumber0 = versionExecutorNumber.get(version); versionExecutorNumber.put(version, executorNumber0 + executorNumber); } else { if(executorNumber != 0) { versionExecutorNumber.put(version, executorNumber); } } // 遍历所有$Jobs子节点,非系统作业 List<String> jobs = jobDimensionService.getAllUnSystemJobs(curatorFrameworkOp); for (String job : jobs) { try{ Boolean localMode = Boolean.valueOf(getData(curatorClient,JobNodePath.getConfigNodePath(job, "localMode"))); String jobDomainKey = job + "-" + config.getNamespace(); JobStatistics jobStatistics = jobMap.get(jobDomainKey); if (jobStatistics == null) { jobStatistics = new JobStatistics(job, config.getNamespace(),config.getNameAndNamespace()); jobMap.put(jobDomainKey, 
											jobStatistics);
								}
								String jobDegree = getData(curatorClient, JobNodePath.getConfigNodePath(job, "jobDegree"));
								if (Strings.isNullOrEmpty(jobDegree)) {
									jobDegree = "0";
								}
								jobStatistics.setJobDegree(Integer.parseInt(jobDegree));
								// only non-local-mode jobs take part in the abnormal-job check
								if (!localMode) {
									AbnormalJob unnormalJob = new AbnormalJob(job, config.getNamespace(), config.getNameAndNamespace(), config.getDegree());
									checkJavaOrShellJobHasProblem(curatorClient, unnormalJob, jobDegree, unnormalJobList);
								}
								// detect jobs that should raise a timeout alarm
								Timeout4AlarmJob timeout4AlarmJob = new Timeout4AlarmJob(job, config.getNamespace(), config.getNameAndNamespace(), config.getDegree());
								if (isTimeout4AlarmJob(timeout4AlarmJob, curatorFrameworkOp) != null) {
									timeout4AlarmJob.setJobDegree(jobDegree);
									timeout4AlarmJobList.add(timeout4AlarmJob);
								}
								// detect jobs that are unable to fail over
								AbnormalJob unableFailoverJob = new AbnormalJob(job, config.getNamespace(), config.getNameAndNamespace(), config.getDegree());
								if (isUnableFailoverJob(curatorClient, unableFailoverJob, curatorFrameworkOp) != null) {
									unableFailoverJob.setJobDegree(jobDegree);
									unableFailoverJobList.add(unableFailoverJob);
								}
								String processCountOfThisJobAllTimeStr = getData(curatorClient, JobNodePath.getProcessCountPath(job));
								String errorCountOfThisJobAllTimeStr = getData(curatorClient, JobNodePath.getErrorCountPath(job));
								int processCountOfThisJobAllTime = processCountOfThisJobAllTimeStr == null ? 0 : Integer.valueOf(processCountOfThisJobAllTimeStr);
								int errorCountOfThisJobAllTime = errorCountOfThisJobAllTimeStr == null ? 0 : Integer.valueOf(errorCountOfThisJobAllTimeStr);
								processCountOfThisDomainAllTime += processCountOfThisJobAllTime;
								errorCountOfThisDomainAllTime += errorCountOfThisJobAllTime;
								int processCountOfThisJobThisDay = 0;
								int errorCountOfThisJobThisDay = 0;
								// loadLevel of this job
								int loadLevel = Integer.parseInt(getData(curatorClient, JobNodePath.getConfigNodePath(job, "loadLevel")));
								int shardingTotalCount = Integer.parseInt(getData(curatorClient, JobNodePath.getConfigNodePath(job, "shardingTotalCount")));
								List<String> servers = null;
								if (null != curatorClient.checkExists().forPath(JobNodePath.getServerNodePath(job))) {
									servers = curatorClient.getChildren().forPath(JobNodePath.getServerNodePath(job));
									for (String server : servers) {
										// If the server node is alive, compute two things: 1. sum the processSuccessCount & processFailureCount under all servers to get the job's daily execution counts; 2. accumulate the executor's loadLevel
										if (checkExists(curatorClient, JobNodePath.getServerStatus(job, server))) {
											// 1. sum the processSuccessCount & processFailureCount of this server for the daily execution counts
											try {
												String processSuccessCountOfThisExeStr = getData(curatorClient, JobNodePath.getProcessSucessCount(job, server));
												String processFailureCountOfThisExeStr = getData(curatorClient, JobNodePath.getProcessFailureCount(job, server));
												int processSuccessCountOfThisExe = processSuccessCountOfThisExeStr == null ? 0 : Integer.valueOf(processSuccessCountOfThisExeStr);
												int processFailureCountOfThisExe = processFailureCountOfThisExeStr == null ? 0 : Integer.valueOf(processFailureCountOfThisExeStr);
												// daily execution statistics of this job
												processCountOfThisJobThisDay += processSuccessCountOfThisExe + processFailureCountOfThisExe;
												errorCountOfThisJobThisDay += processFailureCountOfThisExe;
												// daily success and failure counts of the whole zk cluster
												totalCount += processSuccessCountOfThisExe + processFailureCountOfThisExe;
												errorCount += processFailureCountOfThisExe;
												// daily execution statistics of this domain (accumulate the per-executor counts)
												processCountOfThisDomainThisDay += processSuccessCountOfThisExe + processFailureCountOfThisExe;
												errorCountOfThisDomainThisDay += processFailureCountOfThisExe;
												// daily success and failure counts of this executor
												String executorMapKey = server + "-" + config.getNamespace();
												ExecutorStatistics executorStatistics =
executorMap.get(executorMapKey); if (executorStatistics == null) { executorStatistics = new ExecutorStatistics(server, config.getNamespace()); executorStatistics.setNns(domain.getNns()); executorStatistics.setIp(getData(curatorClient, ExecutorNodePath.getExecutorIpNodePath(server))); executorMap.put(executorMapKey, executorStatistics); } executorStatistics.setFailureCountOfTheDay(executorStatistics.getFailureCountOfTheDay() + processFailureCountOfThisExe); executorStatistics.setProcessCountOfTheDay(executorStatistics.getProcessCountOfTheDay() + processSuccessCountOfThisExe + processFailureCountOfThisExe); } catch (Exception e) { log.info(e.getMessage()); } // 2.统计executor的loadLevel; try { // enabled 的作业才需要计算权重 if (Boolean.valueOf(getData(curatorClient, JobNodePath.getConfigNodePath(job, "enabled")))) { String sharding = getData(curatorClient,JobNodePath.getServerSharding(job, server)); if (StringUtils.isNotEmpty(sharding)) { // 更新job的executorsAndshards String exesAndShards = (jobStatistics.getExecutorsAndShards() == null?"":jobStatistics.getExecutorsAndShards()) + server + ":" + sharding + "; "; jobStatistics.setExecutorsAndShards(exesAndShards); // 2.统计是物理机还是容器 String executorMapKey = server + "-" + config.getNamespace(); ExecutorStatistics executorStatistics = executorMap.get(executorMapKey); if (executorStatistics == null) { executorStatistics = new ExecutorStatistics(server, config.getNamespace()); executorStatistics.setNns(domain.getNns()); executorStatistics.setIp(getData(curatorClient, ExecutorNodePath.getExecutorIpNodePath(server))); executorMap.put(executorMapKey, executorStatistics); // set runInDocker field if (checkExists(curatorClient, ExecutorNodePath.get$ExecutorTaskNodePath(server))) { executorStatistics.setRunInDocker(true); exeInDocker ++; } else { exeNotInDocker ++; } } if (executorStatistics.getJobAndShardings() != null) { executorStatistics.setJobAndShardings(executorStatistics.getJobAndShardings() + job + ":" + sharding + ";"); } else { executorStatistics.setJobAndShardings(job + ":" + sharding + ";"); } int newLoad = executorStatistics.getLoadLevel() + (loadLevel * sharding.split(",").length); executorStatistics.setLoadLevel(newLoad); } } } catch (Exception e) { log.info(e.getMessage()); } } } } // local-mode job = server count(regardless server status) if (localMode) { jobStatistics.setTotalLoadLevel(servers == null?0:(servers.size() * loadLevel)); } else { jobStatistics.setTotalLoadLevel(loadLevel * shardingTotalCount); } jobStatistics.setErrorCountOfAllTime(errorCountOfThisJobAllTime); jobStatistics.setProcessCountOfAllTime(processCountOfThisJobAllTime); jobStatistics.setFailureCountOfTheDay(errorCountOfThisJobThisDay); jobStatistics.setProcessCountOfTheDay(processCountOfThisJobThisDay); jobMap.put(jobDomainKey, jobStatistics); }catch(Exception e){ log.info("statistics namespace:{} ,jobName:{} ,exception:{}",domain.getNns(),job,e.getMessage()); } } // 遍历容器资源,获取异常资源 String dcosTasksNodePath = ContainerNodePath.getDcosTasksNodePath(); List<String> tasks = curatorFrameworkOp.getChildren(dcosTasksNodePath); if(tasks != null && !tasks.isEmpty()) { for(String taskId : tasks) { AbnormalContainer abnormalContainer = new AbnormalContainer(taskId, config.getNamespace(), config.getNameAndNamespace(), config.getDegree()); if(isContainerInstanceMismatch(abnormalContainer, curatorFrameworkOp) != null) { abnormalContainerList.add(abnormalContainer); } } } } } catch (Exception e) { log.info("refreshStatistics2DB namespace:{} ,exception:{}",domain.getNns(), e.getMessage()); } 
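					// Persist the all-time and daily counters aggregated above into this domain's statistics entry before moving on to the next namespace.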
domain.setErrorCountOfAllTime(errorCountOfThisDomainAllTime); domain.setProcessCountOfAllTime(processCountOfThisDomainAllTime); domain.setErrorCountOfTheDay(errorCountOfThisDomainThisDay); domain.setProcessCountOfTheDay(processCountOfThisDomainThisDay); domainList.add(domain); } } jobList.addAll(jobMap.values()); executorList.addAll(executorMap.values()); // 全域当天处理总数,失败总数 saveOrUpdateDomainProcessCount(new ZkStatistics(totalCount, errorCount), zkCluster.getZkAddr()); // 失败率Top10的域列表 saveOrUpdateTop10FailDomain(domainList, zkCluster.getZkAddr()); // 稳定性最差的Top10的域列表 saveOrUpdateTop10UnstableDomain(domainList, zkCluster.getZkAddr()); // 稳定性最差的Top10的executor列表 saveOrUpdateTop10FailExecutor(executorList, zkCluster.getZkAddr()); // 根据失败率Top10的作业列表 saveOrUpdateTop10FailJob(jobList, zkCluster.getZkAddr()); // 最活跃作业Top10的作业列表(即当天执行次数最多的作业) saveOrUpdateTop10ActiveJob(jobList, zkCluster.getZkAddr()); // 负荷最重的Top10的作业列表 saveOrUpdateTop10LoadJob(jobList, zkCluster.getZkAddr()); // 负荷最重的Top10的Executor列表 saveOrUpdateTop10LoadExecutor(executorList, zkCluster.getZkAddr()); // 异常作业列表 (如下次调度时间已经过了,但是作业没有被调度) saveOrUpdateAbnormalJob(unnormalJobList, zkCluster.getZkAddr()); // 超时告警的作业列表 saveOrUpdateTimeout4AlarmJob(timeout4AlarmJobList, zkCluster.getZkAddr()); // 无法高可用的作业列表 saveOrUpdateUnableFailoverJob(unableFailoverJobList, zkCluster.getZkAddr()); // 异常容器资源列表,包含实例数不匹配的资源列表 saveOrUpdateAbnormalContainer(abnormalContainerList, zkCluster.getZkAddr()); // 不同版本的域数量 saveOrUpdateVersionDomainNumber(versionDomainNumber, zkCluster.getZkAddr()); // 不同版本的executor数量 saveOrUpdateVersionExecutorNumber(versionExecutorNumber, zkCluster.getZkAddr()); UNNORMAL_JOB_LIST_CACHE.put(zkCluster.getZkAddr(), unnormalJobList); JOB_MAP_CACHE.put(zkCluster.getZkAddr(), jobMap); EXECUTOR_MAP_CACHE.put(zkCluster.getZkAddr(), executorMap); DOCKER_EXECUTOR_COUNT_MAP.put(zkCluster.getZkAddr(), exeInDocker); PHYSICAL_EXECUTOR_COUNT_MAP.put(zkCluster.getZkAddr(), exeNotInDocker); } } private void saveOrUpdateTop10FailExecutor(List<ExecutorStatistics> executorList, String zkAddr) { try { executorList = DashboardServiceHelper.sortExecutorByFailureRate(executorList); List<ExecutorStatistics> top10FailExecutor = executorList.subList(0, executorList.size() > 9?10:executorList.size()); String top10FailExecutorJsonString = JSON.toJSONString(top10FailExecutor); SaturnStatistics top10FailExecutorFromDB = saturnStatisticsService.findStatisticsByNameAndZkList(StatisticsTableKeyConstant.TOP_10_FAIL_EXECUTOR, zkAddr); if (top10FailExecutorFromDB == null) { SaturnStatistics ss = new SaturnStatistics(StatisticsTableKeyConstant.TOP_10_FAIL_EXECUTOR, zkAddr, top10FailExecutorJsonString); saturnStatisticsService.create(ss); } else { top10FailExecutorFromDB.setResult(top10FailExecutorJsonString); saturnStatisticsService.updateByPrimaryKey(top10FailExecutorFromDB); } } catch (Exception e) { log.error(e.getMessage(), e); } } private void saveOrUpdateTop10FailDomain(List<DomainStatistics> domainList, String zkAddr) { try { domainList = DashboardServiceHelper.sortDomainByAllTimeFailureRate(domainList); List<DomainStatistics> top10FailDomainList = domainList.subList(0, domainList.size() > 9? 
10:domainList.size()); String top10FailDomainJsonString = JSON.toJSONString(top10FailDomainList); SaturnStatistics top10FailDomainFromDB = saturnStatisticsService.findStatisticsByNameAndZkList(StatisticsTableKeyConstant.TOP_10_FAIL_DOMAIN, zkAddr); if (top10FailDomainFromDB == null) { SaturnStatistics ss = new SaturnStatistics(StatisticsTableKeyConstant.TOP_10_FAIL_DOMAIN, zkAddr, top10FailDomainJsonString); saturnStatisticsService.create(ss); } else { top10FailDomainFromDB.setResult(top10FailDomainJsonString); saturnStatisticsService.updateByPrimaryKey(top10FailDomainFromDB); } } catch (Exception e) { log.error(e.getMessage(), e); } } private void saveOrUpdateTop10UnstableDomain(List<DomainStatistics> domainList, String zkAddr) { try { domainList = DashboardServiceHelper.sortDomainByShardingCount(domainList); List<DomainStatistics> top10UnstableDomain = domainList.subList(0, domainList.size() > 9? 10:domainList.size()); String top10UnstableDomainJsonString = JSON.toJSONString(top10UnstableDomain); SaturnStatistics top10UnstableDomainFromDB = saturnStatisticsService.findStatisticsByNameAndZkList(StatisticsTableKeyConstant.TOP_10_UNSTABLE_DOMAIN, zkAddr); if (top10UnstableDomainFromDB == null) { SaturnStatistics ss = new SaturnStatistics(StatisticsTableKeyConstant.TOP_10_UNSTABLE_DOMAIN, zkAddr, top10UnstableDomainJsonString); saturnStatisticsService.create(ss); } else { top10UnstableDomainFromDB.setResult(top10UnstableDomainJsonString); saturnStatisticsService.updateByPrimaryKey(top10UnstableDomainFromDB); } } catch (Exception e) { log.error(e.getMessage(), e); } } private void saveOrUpdateTop10FailJob(List<JobStatistics> jobList, String zkAddr) { try { jobList = DashboardServiceHelper.sortJobByAllTimeFailureRate(jobList); List<JobStatistics> top10FailJob = jobList.subList(0, jobList.size() > 9?10:jobList.size()); String top10FailJobJsonString = JSON.toJSONString(top10FailJob); SaturnStatistics top10FailJobFromDB = saturnStatisticsService.findStatisticsByNameAndZkList(StatisticsTableKeyConstant.TOP_10_FAIL_JOB, zkAddr); if (top10FailJobFromDB == null) { SaturnStatistics ss = new SaturnStatistics(StatisticsTableKeyConstant.TOP_10_FAIL_JOB, zkAddr, top10FailJobJsonString); saturnStatisticsService.create(ss); } else { top10FailJobFromDB.setResult(top10FailJobJsonString); saturnStatisticsService.updateByPrimaryKey(top10FailJobFromDB); } } catch (Exception e) { log.error(e.getMessage(), e); } } private void saveOrUpdateTop10ActiveJob(List<JobStatistics> jobList, String zkAddr) { try { jobList = DashboardServiceHelper.sortJobByDayProcessCount(jobList); List<JobStatistics> top10ActiveJob = jobList.subList(0, jobList.size() > 9?10:jobList.size()); String top10ActiveJobJsonString = JSON.toJSONString(top10ActiveJob); SaturnStatistics top10ActiveJobFromDB = saturnStatisticsService.findStatisticsByNameAndZkList(StatisticsTableKeyConstant.TOP_10_ACTIVE_JOB, zkAddr); if (top10ActiveJobFromDB == null) { SaturnStatistics ss = new SaturnStatistics(StatisticsTableKeyConstant.TOP_10_ACTIVE_JOB, zkAddr, top10ActiveJobJsonString); saturnStatisticsService.create(ss); } else { top10ActiveJobFromDB.setResult(top10ActiveJobJsonString); saturnStatisticsService.updateByPrimaryKey(top10ActiveJobFromDB); } } catch (Exception e) { log.error(e.getMessage(), e); } } private void saveOrUpdateTop10LoadJob(List<JobStatistics> jobList, String zkAddr) { try { jobList = DashboardServiceHelper.sortJobByLoadLevel(jobList); List<JobStatistics> top10LoadJob = jobList.subList(0, jobList.size() > 9?10:jobList.size()); String 
top10LoadJobJsonString = JSON.toJSONString(top10LoadJob); SaturnStatistics top10LoadJobFromDB = saturnStatisticsService.findStatisticsByNameAndZkList(StatisticsTableKeyConstant.TOP_10_LOAD_JOB, zkAddr); if (top10LoadJobFromDB == null) { SaturnStatistics ss = new SaturnStatistics(StatisticsTableKeyConstant.TOP_10_LOAD_JOB, zkAddr, top10LoadJobJsonString); saturnStatisticsService.create(ss); } else { top10LoadJobFromDB.setResult(top10LoadJobJsonString); saturnStatisticsService.updateByPrimaryKey(top10LoadJobFromDB); } } catch (Exception e) { log.error(e.getMessage(), e); } } private void saveOrUpdateTop10LoadExecutor(List<ExecutorStatistics> executorList, String zkAddr) { try { executorList = DashboardServiceHelper.sortExecutorByLoadLevel(executorList); List<ExecutorStatistics> top10LoadExecutor = executorList.subList(0, executorList.size() > 9?10:executorList.size()); String top10LoadExecutorJsonString = JSON.toJSONString(top10LoadExecutor); SaturnStatistics top10LoadExecutorFromDB = saturnStatisticsService.findStatisticsByNameAndZkList(StatisticsTableKeyConstant.TOP_10_LOAD_EXECUTOR, zkAddr); if (top10LoadExecutorFromDB == null) { SaturnStatistics ss = new SaturnStatistics(StatisticsTableKeyConstant.TOP_10_LOAD_EXECUTOR, zkAddr, top10LoadExecutorJsonString); saturnStatisticsService.create(ss); } else { top10LoadExecutorFromDB.setResult(top10LoadExecutorJsonString); saturnStatisticsService.updateByPrimaryKey(top10LoadExecutorFromDB); } } catch (Exception e) { log.error(e.getMessage(), e); } } private void saveOrUpdateDomainProcessCount(ZkStatistics zks, String zkAddr) { String domainListJsonString = JSON.toJSONString(zks); SaturnStatistics domainProcessCountFromDB = saturnStatisticsService.findStatisticsByNameAndZkList(StatisticsTableKeyConstant.DOMAIN_PROCESS_COUNT_OF_THE_DAY, zkAddr); if (domainProcessCountFromDB == null) { SaturnStatistics ss = new SaturnStatistics(StatisticsTableKeyConstant.DOMAIN_PROCESS_COUNT_OF_THE_DAY, zkAddr, domainListJsonString); saturnStatisticsService.create(ss); } else { domainProcessCountFromDB.setResult(domainListJsonString); saturnStatisticsService.updateByPrimaryKey(domainProcessCountFromDB); } } private void saveOrUpdateAbnormalJob(List<AbnormalJob> unnormalJobList, String zkAddr) { unnormalJobList = DashboardServiceHelper.sortUnnormaoJobByTimeDesc(unnormalJobList); String unnormalJobJsonString = JSON.toJSONString(unnormalJobList); SaturnStatistics unnormalJobFromDB = saturnStatisticsService.findStatisticsByNameAndZkList(StatisticsTableKeyConstant.UNNORMAL_JOB, zkAddr); if (unnormalJobFromDB == null) { SaturnStatistics ss = new SaturnStatistics(StatisticsTableKeyConstant.UNNORMAL_JOB, zkAddr, unnormalJobJsonString); saturnStatisticsService.create(ss); } else { unnormalJobFromDB.setResult(unnormalJobJsonString); saturnStatisticsService.updateByPrimaryKey(unnormalJobFromDB); } } private void saveOrUpdateTimeout4AlarmJob(List<Timeout4AlarmJob> timeout4AlarmJobList, String zkAddr) { String timeout4AlarmJobJsonString = JSON.toJSONString(timeout4AlarmJobList); SaturnStatistics timeout4AlarmJobFromDB = saturnStatisticsService.findStatisticsByNameAndZkList(StatisticsTableKeyConstant.TIMEOUT_4_ALARM_JOB, zkAddr); if (timeout4AlarmJobFromDB == null) { SaturnStatistics ss = new SaturnStatistics(StatisticsTableKeyConstant.TIMEOUT_4_ALARM_JOB, zkAddr, timeout4AlarmJobJsonString); saturnStatisticsService.create(ss); } else { timeout4AlarmJobFromDB.setResult(timeout4AlarmJobJsonString); saturnStatisticsService.updateByPrimaryKey(timeout4AlarmJobFromDB); } } private void 
saveOrUpdateUnableFailoverJob(List<AbnormalJob> unableFailoverJobList, String zkAddr) { String unableFailoverJobJsonString = JSON.toJSONString(unableFailoverJobList); SaturnStatistics unableFailoverJobFromDB = saturnStatisticsService.findStatisticsByNameAndZkList(StatisticsTableKeyConstant.UNABLE_FAILOVER_JOB, zkAddr); if (unableFailoverJobFromDB == null) { SaturnStatistics ss = new SaturnStatistics(StatisticsTableKeyConstant.UNABLE_FAILOVER_JOB, zkAddr, unableFailoverJobJsonString); saturnStatisticsService.create(ss); } else { unableFailoverJobFromDB.setResult(unableFailoverJobJsonString); saturnStatisticsService.updateByPrimaryKey(unableFailoverJobFromDB); } } private void saveOrUpdateAbnormalContainer(List<AbnormalContainer> abnormalContainerList, String zkAddr) { String abnormalContainerJsonString = JSON.toJSONString(abnormalContainerList); SaturnStatistics abnormalContainerFromDB = saturnStatisticsService.findStatisticsByNameAndZkList(StatisticsTableKeyConstant.ABNORMAL_CONTAINER, zkAddr); if (abnormalContainerFromDB == null) { SaturnStatistics ss = new SaturnStatistics(StatisticsTableKeyConstant.ABNORMAL_CONTAINER, zkAddr, abnormalContainerJsonString); saturnStatisticsService.create(ss); } else { abnormalContainerFromDB.setResult(abnormalContainerJsonString); saturnStatisticsService.updateByPrimaryKey(abnormalContainerFromDB); } } private void saveOrUpdateVersionDomainNumber(Map<String, Long> versionDomainNumber, String zkAddr) { try { String versionDomainNumberJsonString = JSON.toJSONString(versionDomainNumber); SaturnStatistics versionDomainNumberFromDB = saturnStatisticsService.findStatisticsByNameAndZkList(StatisticsTableKeyConstant.VERSION_DOMAIN_NUMBER, zkAddr); if (versionDomainNumberFromDB == null) { SaturnStatistics ss = new SaturnStatistics(StatisticsTableKeyConstant.VERSION_DOMAIN_NUMBER, zkAddr, versionDomainNumberJsonString); saturnStatisticsService.create(ss); } else { versionDomainNumberFromDB.setResult(versionDomainNumberJsonString); saturnStatisticsService.updateByPrimaryKey(versionDomainNumberFromDB); } } catch (Exception e) { log.error(e.getMessage(), e); } } private void saveOrUpdateVersionExecutorNumber(Map<String, Long> versionExecutorNumber, String zkAddr) { try { String versionExecutorNumberJsonString = JSON.toJSONString(versionExecutorNumber); SaturnStatistics versionExecutorNumberFromDB = saturnStatisticsService.findStatisticsByNameAndZkList(StatisticsTableKeyConstant.VERSION_EXECUTOR_NUMBER, zkAddr); if (versionExecutorNumberFromDB == null) { SaturnStatistics ss = new SaturnStatistics(StatisticsTableKeyConstant.VERSION_EXECUTOR_NUMBER, zkAddr, versionExecutorNumberJsonString); saturnStatisticsService.create(ss); } else { versionExecutorNumberFromDB.setResult(versionExecutorNumberJsonString); saturnStatisticsService.updateByPrimaryKey(versionExecutorNumberFromDB); } } catch (Exception e) { log.error(e.getMessage(), e); } } private void fillAbnormalJob(CuratorFramework curatorClient, AbnormalJob abnormalJob, String cause, String timeZone, long nextFireTimeExcludePausePeriod) throws Exception{ boolean areNotReady = true; String serverNodePath = JobNodePath.getServerNodePath(abnormalJob.getJobName()); if(checkExists(curatorClient, serverNodePath)) { List<String> servers = curatorClient.getChildren().forPath(serverNodePath); if(servers != null && !servers.isEmpty()) { for(String server : servers) { if(checkExists(curatorClient, JobNodePath.getServerStatus(abnormalJob.getJobName(), server))) { areNotReady = false; break; } } } } if(areNotReady) { cause = 
					AbnormalJob.Cause.EXECUTORS_NOT_READY.name();
		}
		abnormalJob.setCause(cause);
		abnormalJob.setTimeZone(timeZone);
		SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
		sdf.setTimeZone(TimeZone.getTimeZone(timeZone));
		abnormalJob.setNextFireTimeWithTimeZoneFormat(sdf.format(nextFireTimeExcludePausePeriod));
		abnormalJob.setNextFireTime(nextFireTimeExcludePausePeriod);
	}

	/**
	 * Check and handle a problematic job (sharding item).
	 * @param curatorClient
	 * @param abnormalJob
	 * @param enabledPath
	 * @param shardingItemStr
	 * @param zkNodeCVersion
	 * @param jobDegree
	 * @param unnormalJobList
	 * @throws Exception
	 */
	private void checkAndHandleJobProblem(CuratorFramework curatorClient, AbnormalJob abnormalJob, String enabledPath, String shardingItemStr, int zkNodeCVersion, String jobDegree, List<AbnormalJob> unnormalJobList) throws Exception {
		if (unnormalJobList.contains(abnormalJob)) {
			return;
		}
		long nextFireTime = checkShardingState(curatorClient, abnormalJob, enabledPath, shardingItemStr);
		if (nextFireTime != -1 && doubleCheckShardingState(abnormalJob, shardingItemStr, zkNodeCVersion)) {
			if (abnormalJob.getCause() == null) {
				abnormalJob.setCause(AbnormalJob.Cause.NOT_RUN.name());
			}
			String timeZone = getTimeZone(abnormalJob.getJobName(), curatorClient);
			// report to Hermes
			registerAbnormalJob(abnormalJob.getJobName(), abnormalJob.getDomainName(), timeZone, nextFireTime);
			// fill in the abnormal details
			fillAbnormalJob(curatorClient, abnormalJob, abnormalJob.getCause(), timeZone, nextFireTime);
			// add to the abnormal job list
			abnormalJob.setJobDegree(jobDegree);
			unnormalJobList.add(abnormalJob);
			log.info("Job sharding alert with DomainName: {}, JobName: {}, ShardingItem: {}, Cause: {}", abnormalJob.getDomainName(), abnormalJob.getJobName(), shardingItemStr, abnormalJob.getCause());
		}
	}

	/**
	 * Returns true only when the alert condition holds for two consecutive checks, otherwise false.
	 * ALLOW_DELAY_MILLIONSECONDS * 1.5 dampens the precision of the check window.
	 * Alert conditions:
	 * 1. the previous check already alerted and this check window alerts again (two consecutive alerts)
	 * 2. the CVersion recorded by the previous alert equals the current one (i.e. no child-node change within this check window)
	 */
	private static boolean doubleCheckShardingState(AbnormalJob abnormalJob, String shardingItemStr, int zkNodeCVersion) {
		String key = abnormalJob.getDomainName() + "_" + abnormalJob.getJobName() + "_" + shardingItemStr;
		long nowTime = System.currentTimeMillis();
		if (ABNORMAL_SHARDING_STATE_CACHE.containsKey(key)) {
			AbnormalShardingState abnormalShardingState = ABNORMAL_SHARDING_STATE_CACHE.get(key);
			if (abnormalShardingState != null && abnormalShardingState.getAlertTime() + ALLOW_DELAY_MILLIONSECONDS * 1.5 > nowTime && abnormalShardingState.getZkNodeCVersion() == zkNodeCVersion) {
				ABNORMAL_SHARDING_STATE_CACHE.put(key, new AbnormalShardingState(nowTime, zkNodeCVersion)); // refresh the alert
				return true;
			} else {
				ABNORMAL_SHARDING_STATE_CACHE.put(key, new AbnormalShardingState(nowTime, zkNodeCVersion)); // replace the invalid (stale) alert
				return false;
			}
		} else {
			ABNORMAL_SHARDING_STATE_CACHE.put(key, new AbnormalShardingState(nowTime, zkNodeCVersion)); // record a new alert entry
			return false;
		}
	}

	/**
	 * Determine the state of a sharding item.
	 *
	 * Logic:
	 * 1. a running node exists: normal
	 * 2.1. a completed node exists but its mtime can no longer be read: the node just changed, so normal
	 * 2.2. otherwise compute the next fire time from the mtime; if it is earlier than now + allowed delay, the item is overdue and abnormal
	 * 3. neither running nor completed exists: abnormal
	 * @param curatorClient
	 * @param abnormalJob
	 * @param shardingItemStr
	 * @return -1 if the state is normal; any other value means abnormal
	 * @throws Exception
	 */
	private long checkShardingState(CuratorFramework curatorClient, AbnormalJob abnormalJob, String enabledPath, String shardingItemStr) throws Exception {
		List<String> itemChildren = curatorClient.getChildren().forPath(JobNodePath.getExecutionItemNodePath(abnormalJob.getJobName(), shardingItemStr));
		// Note: for the stock-update domain, which does not report node information yet leaves sharding residue, the sharding node has exactly two children; treat that as normal
		if (itemChildren.size() != 2) {
			// a running node exists: normal
			if (itemChildren.contains("running")) {
return -1; } //有completed节点:尝试取分片节点的Mtime时间 //1、能取到则根据Mtime计算下次触发时间,比较下次触发时间是否小于当前时间+延时, 是则为过时未跑有异常 //2、取不到(为0)说明completed节点刚好被删除了,节点有变动说明正常(上一秒还在,下一秒不在了) else if (itemChildren.contains("completed")) { String completedPath = JobNodePath.getExecutionNodePath(abnormalJob.getJobName(), shardingItemStr, "completed"); long completedMtime = getMtime(curatorClient, completedPath); if (completedMtime > 0) { // 对比minCompletedMtime与enabled mtime, 取最大值 long nextFireTimeAfterThis = getMtime(curatorClient, enabledPath); if (nextFireTimeAfterThis < completedMtime) { nextFireTimeAfterThis = completedMtime; } Long nextFireTimeExcludePausePeriod = jobDimensionService.getNextFireTimeAfterSpecifiedTimeExcludePausePeriod(nextFireTimeAfterThis, abnormalJob.getJobName(), new CuratorRepositoryImpl().newCuratorFrameworkOp(curatorClient)); // 下次触发时间是否小于当前时间+延时, 是则为过时未跑有异常 if (nextFireTimeExcludePausePeriod != null && nextFireTimeExcludePausePeriod + ALLOW_DELAY_MILLIONSECONDS < new Date().getTime() ) { return nextFireTimeExcludePausePeriod; } } else { return -1; } } // 既没有running又没completed视为异常 else { if (abnormalJob.getNextFireTimeAfterEnabledMtime() == 0) { abnormalJob.setNextFireTimeAfterEnabledMtime(jobDimensionService.getNextFireTimeAfterSpecifiedTimeExcludePausePeriod(getMtime(curatorClient, enabledPath), abnormalJob.getJobName(), new CuratorRepositoryImpl().newCuratorFrameworkOp(curatorClient))); } Long nextFireTime = abnormalJob.getNextFireTimeAfterEnabledMtime(); // 下次触发时间是否小于当前时间+延时, 是则为过时未跑有异常 if (nextFireTime != null && nextFireTime + ALLOW_DELAY_MILLIONSECONDS < new Date().getTime()) { return nextFireTime; } } } return -1; } private void checkJavaOrShellJobHasProblem(CuratorFramework curatorClient, AbnormalJob abnormalJob, String jobDegree, List<AbnormalJob> unnormalJobList) { try { // 计算异常作业,根据$Jobs/jobName/execution/item/nextFireTime,如果小于当前时间且作业不在running,则为异常 // 只有java/shell作业有cron String jobType = getData(curatorClient, JobNodePath.getConfigNodePath(abnormalJob.getJobName(), "jobType")); if (JobType.JAVA_JOB.name().equals(jobType) || JobType.SHELL_JOB.name().equals(jobType)) { // enabled 的作业才需要判断 String enabledPath = JobNodePath.getConfigNodePath(abnormalJob.getJobName(), "enabled"); if (Boolean.valueOf(getData(curatorClient, enabledPath))) { String enabledReportPath = JobNodePath.getConfigNodePath(abnormalJob.getJobName(), "enabledReport"); String enabledReportVal = getData(curatorClient, enabledReportPath); // 开启上报运行信息 if (enabledReportVal == null || "true".equals(enabledReportVal)) { String executionRootpath = JobNodePath.getExecutionNodePath(abnormalJob.getJobName()); // 有execution节点 List<String> items = null; try { items = curatorClient.getChildren().forPath(executionRootpath); }catch (Exception e) { } // 有分片 if (items != null && !items.isEmpty()) { int shardingTotalCount = Integer.parseInt(getData(curatorClient,JobNodePath.getConfigNodePath(abnormalJob.getJobName(), "shardingTotalCount"))); for (String itemStr : items) { int each = Integer.parseInt(itemStr); // 过滤历史遗留分片 if (each >= shardingTotalCount) { continue; } checkAndHandleJobProblem(curatorClient, abnormalJob, enabledPath, itemStr, getCVersion(curatorClient, JobNodePath.getExecutionItemNodePath(abnormalJob.getJobName(), itemStr)), jobDegree, unnormalJobList); } } else { // 无分片 abnormalJob.setCause(AbnormalJob.Cause.NO_SHARDS.name()); Long nextFireTime = jobDimensionService.getNextFireTimeAfterSpecifiedTimeExcludePausePeriod(getMtime(curatorClient, enabledPath), abnormalJob.getJobName(), new 
CuratorRepositoryImpl().newCuratorFrameworkOp(curatorClient)); // 下次触发时间是否小于当前时间+延时, 是则为过时未跑有异常 if (nextFireTime != null && nextFireTime + ALLOW_DELAY_MILLIONSECONDS < new Date().getTime() ) { String timeZone = getTimeZone(abnormalJob.getJobName(), curatorClient); //上报Hermes registerAbnormalJob(abnormalJob.getJobName(), abnormalJob.getDomainName(), timeZone, nextFireTime); //补充异常信息 fillAbnormalJob(curatorClient, abnormalJob, abnormalJob.getCause(), timeZone, nextFireTime); //增加到非正常作业列表 abnormalJob.setJobDegree(jobDegree); unnormalJobList.add(abnormalJob); log.info("Job sharding alert with DomainName: {}, JobName: {}, ShardingItem: {}, Cause: {}", abnormalJob.getDomainName(), abnormalJob.getJobName(), 0, abnormalJob.getCause()); } } } } } } catch (Exception e) { log.error(e.getMessage(), e); } } private String getTimeZone(String jobName, CuratorFramework curatorClient) { String timeZoneStr = getData(curatorClient, JobNodePath.getConfigNodePath(jobName, "timeZone")); if(timeZoneStr == null || timeZoneStr.trim().length() == 0) { timeZoneStr = SaturnConstants.TIME_ZONE_ID_DEFAULT; } return timeZoneStr; } /** * 如果配置了超时告警时间,而且running节点存在时间大于它,则告警 */ private Timeout4AlarmJob isTimeout4AlarmJob(Timeout4AlarmJob timeout4AlarmJob, CuratorFrameworkOp curatorFrameworkOp) { String jobName = timeout4AlarmJob.getJobName(); String timeout4AlarmSecondsStr = curatorFrameworkOp.getData(JobNodePath.getConfigNodePath(jobName, "timeout4AlarmSeconds")); int timeout4AlarmSeconds = 0; if(timeout4AlarmSecondsStr != null) { try { timeout4AlarmSeconds = Integer.parseInt(timeout4AlarmSecondsStr); } catch (NumberFormatException e) { log.error(e.getMessage(), e); } } if(timeout4AlarmSeconds > 0) { List<String> items = new ArrayList<>(); List<String> tmp = curatorFrameworkOp.getChildren(JobNodePath.getExecutionNodePath(jobName)); if (tmp != null) { items.addAll(tmp); } if (items != null && !items.isEmpty()) { long timeout4AlarmMills = timeout4AlarmSeconds * 1L * 1000; timeout4AlarmJob.setTimeout4AlarmSeconds(timeout4AlarmSeconds); for(String itemStr : items) { long ctime = curatorFrameworkOp.getCtime(JobNodePath.getExecutionNodePath(jobName, itemStr, "running")); if(ctime > 0 && System.currentTimeMillis() - ctime > timeout4AlarmMills) { timeout4AlarmJob.getTimeoutItems().add(Integer.parseInt(itemStr)); } } if(!timeout4AlarmJob.getTimeoutItems().isEmpty()) { try { reportAlarmService.dashboardTimeout4AlarmJob(timeout4AlarmJob.getDomainName(), jobName, timeout4AlarmJob.getTimeoutItems(), timeout4AlarmSeconds); } catch (Throwable t) { log.error(t.getMessage(), t); } return timeout4AlarmJob; } } } return null; } // 无法高可用的情况: // 1、勾选只使用优先executor,preferList只有一个物理机器(剔除offline、deleted的物理机) // 2、没有勾选只使用优先executor,没有选择容器资源,可供选择的preferList只有一个物理机器(剔除offline、deleted的物理机,剔除容器资源) private AbnormalJob isUnableFailoverJob(CuratorFramework curatorClient, AbnormalJob unableFailoverJob, CuratorFrameworkOp curatorFrameworkOp) { try { String jobName = unableFailoverJob.getJobName(); String preferList = getData(curatorClient, JobNodePath.getConfigNodePath(jobName, "preferList")); Boolean onlyUsePreferList = !Boolean.valueOf(getData(curatorClient, JobNodePath.getConfigNodePath(jobName, "useDispreferList"))); String preferListCandidateStr = jobDimensionService.getAllExecutors(jobName,curatorFrameworkOp); List<String> preferListArr = new ArrayList<>(); if(preferList != null && preferList.trim().length() > 0) { String[] split = preferList.split(","); for(String prefer : split) { String tmp = prefer.trim(); if(tmp.length() > 0) { 
if(!preferListArr.contains(tmp)) { preferListArr.add(tmp); } } } } if(preferListCandidateStr != null && preferListCandidateStr.trim().length() > 0) { String[] preferListCandidateArr = preferListCandidateStr.split(","); if (onlyUsePreferList) { boolean containerSelected = false; int count = 0; for(String preferListCandidate : preferListCandidateArr) { String tmp = preferListCandidate.split("\\(")[0]; if(preferListCandidate.indexOf("容器资源") != -1) { tmp = "@" + tmp; } if(preferListArr.contains(tmp)) { if (preferListCandidate.indexOf("容器资源") != -1) { containerSelected = true; break; } else { if (preferListCandidate.indexOf("已离线") == -1 && preferListCandidate.indexOf("已删除") == -1) { count++; } } } } if(!containerSelected && count == 1) { return unableFailoverJob; } } else { boolean containerSelected = false; int count = 0; for(String preferListCandidate : preferListCandidateArr) { if(preferListCandidate.indexOf("容器资源") != -1 && preferListArr.contains("@" + preferListCandidate.split("\\(")[0])) { containerSelected = true; break; } if(preferListCandidate.indexOf("已离线") == -1 && preferListCandidate.indexOf("已删除") == -1 && preferListCandidate.indexOf("容器资源") == -1) { count++; } } if(!containerSelected && count == 1) { return unableFailoverJob; } } } return null; } catch (Exception e) { log.error(e.getMessage(), e); return null; } } private void registerAbnormalJob(String job, String domain, String timeZone, Long nextFireTimeValue) { try { reportAlarmService.dashboardAbnormalJob(domain, job, timeZone, nextFireTimeValue); } catch (Throwable t) { log.error(t.getMessage(), t); } } private AbnormalContainer isContainerInstanceMismatch(AbnormalContainer abnormalContainer, CuratorFrameworkOp curatorFrameworkOp) { try { String taskId = abnormalContainer.getTaskId(); String dcosTaskConfigNodePath = ContainerNodePath.getDcosTaskConfigNodePath(taskId); long configMtime = curatorFrameworkOp.getMtime(dcosTaskConfigNodePath); String dcosTaskScaleJobsNodePath = ContainerNodePath.getDcosTaskScaleJobsNodePath(taskId); List<String> scaleJobs = curatorFrameworkOp.getChildren(dcosTaskScaleJobsNodePath); long maxItemMtime = 0L; String lastScalaJob = null; if (scaleJobs != null && !taskId.isEmpty()) { for (String scaleJob : scaleJobs) { String completedNodePath = JobNodePath.getExecutionNodePath(scaleJob, "0", "completed"); long completedMtime = curatorFrameworkOp.getMtime(completedNodePath); if (completedMtime > maxItemMtime) { lastScalaJob = scaleJob; maxItemMtime = completedMtime; } } } Integer myInstance = -1; if (configMtime > maxItemMtime) { String taskConfigData = curatorFrameworkOp.getData(dcosTaskConfigNodePath); if (taskConfigData != null && taskConfigData.trim().length() > 0) { ContainerConfig containerConfig = JSON.parseObject(taskConfigData, ContainerConfig.class); myInstance = containerConfig.getInstances(); } } else if (configMtime < maxItemMtime) { String dcosTaskScaleJobNodePath = ContainerNodePath.getDcosTaskScaleJobNodePath(taskId, lastScalaJob); String scaleJobData = curatorFrameworkOp.getData(dcosTaskScaleJobNodePath); if (scaleJobData != null && scaleJobData.trim().length() > 0) { ContainerScaleJob containerScaleJob = JSON.parseObject(scaleJobData, ContainerScaleJob.class); myInstance = containerScaleJob.getContainerScaleJobConfig().getInstances(); } } if (myInstance != -1) { int count = containerService.getContainerRunningInstances(taskId, curatorFrameworkOp); if(myInstance != count) { abnormalContainer.setCause(AbnormalContainer.Cause.CONTAINER_INSTANCE_MISMATCH.name()); 
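					// The configured instance count differs from the running count: record both values and report a container-instances-mismatch alarm.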
abnormalContainer.setConfigInstances(myInstance); abnormalContainer.setRunningInstances(count); try { reportAlarmService.dashboardContainerInstancesMismatch(abnormalContainer.getDomainName(), abnormalContainer.getTaskId(), abnormalContainer.getConfigInstances(), abnormalContainer.getRunningInstances()); } catch (Exception e) { log.error(e.getMessage(), e); } return abnormalContainer; } } } catch (Exception e) { log.error(e.getMessage(), e); } return null; } public boolean checkExists(final CuratorFramework curatorClient, final String znode) { try { return null != curatorClient.checkExists().forPath(znode); // CHECKSTYLE:OFF } catch (final Exception ex) { // CHECKSTYLE:ON throw new JobConsoleException(ex); } } public long getMtime(final CuratorFramework curatorClient, final String znode) { try { Stat stat = curatorClient.checkExists().forPath(znode); if (stat != null) { return stat.getMtime(); } else { return 0l; } } catch (final Exception ex) { // CHECKSTYLE:ON throw new JobConsoleException(ex); } } public int getCVersion(final CuratorFramework curatorClient, final String znode) { try { Stat stat = curatorClient.checkExists().forPath(znode); if (stat != null) { return stat.getCversion(); } else { return 0; } } catch (final Exception ex) { // CHECKSTYLE:ON throw new JobConsoleException(ex); } } public String getData(final CuratorFramework curatorClient, final String znode) { try { if (checkExists(curatorClient, znode)) { byte[] getZnodeData = curatorClient.getData().forPath(znode); if (getZnodeData == null) {// executor的分片可能存在全部飘走的情况,sharding节点有可能获取到的是null,需要对null做判断,否则new // String时会报空指针异常 return null; } return new String(getZnodeData, Charset.forName("UTF-8")); } else { return null; } } catch (final NoNodeException ex) { return null; // CHECKSTYLE:OFF } catch (final Exception ex) { // CHECKSTYLE:ON throw new JobConsoleException(ex); } } @Override public SaturnStatistics top10FailureJob(String zklist) { return saturnStatisticsService.findStatisticsByNameAndZkList(StatisticsTableKeyConstant.TOP_10_FAIL_JOB, zklist); } @Override public SaturnStatistics top10FailureExecutor(String zklist) { return saturnStatisticsService.findStatisticsByNameAndZkList(StatisticsTableKeyConstant.TOP_10_FAIL_EXECUTOR, zklist); } @Override public SaturnStatistics top10AactiveJob(String zklist) { return saturnStatisticsService.findStatisticsByNameAndZkList(StatisticsTableKeyConstant.TOP_10_ACTIVE_JOB, zklist); } @Override public SaturnStatistics top10LoadExecutor(String zklist) { return saturnStatisticsService.findStatisticsByNameAndZkList(StatisticsTableKeyConstant.TOP_10_LOAD_EXECUTOR, zklist); } @Override public SaturnStatistics top10LoadJob(String zklist) { return saturnStatisticsService.findStatisticsByNameAndZkList(StatisticsTableKeyConstant.TOP_10_LOAD_JOB, zklist); } @Override public SaturnStatistics top10UnstableDomain(String zklist) { return saturnStatisticsService.findStatisticsByNameAndZkList(StatisticsTableKeyConstant.TOP_10_UNSTABLE_DOMAIN, zklist); } @Override public SaturnStatistics allProcessAndErrorCountOfTheDay(String zklist) { return saturnStatisticsService.findStatisticsByNameAndZkList(StatisticsTableKeyConstant.DOMAIN_PROCESS_COUNT_OF_THE_DAY, zklist); } @Override public SaturnStatistics allUnnormalJob(String zklist) { return saturnStatisticsService.findStatisticsByNameAndZkList(StatisticsTableKeyConstant.UNNORMAL_JOB, zklist); } @Override public SaturnStatistics allTimeout4AlarmJob(String zklist) { return 
saturnStatisticsService.findStatisticsByNameAndZkList(StatisticsTableKeyConstant.TIMEOUT_4_ALARM_JOB, zklist); } @Override public SaturnStatistics allUnableFailoverJob(String zklist) { return saturnStatisticsService.findStatisticsByNameAndZkList(StatisticsTableKeyConstant.UNABLE_FAILOVER_JOB, zklist); } @Override public SaturnStatistics top10FailureDomain(String zklist) { return saturnStatisticsService.findStatisticsByNameAndZkList(StatisticsTableKeyConstant.TOP_10_FAIL_DOMAIN, zklist); } @Override public void cleanShardingCount(String nns) throws Exception { // 获取当前连接 RegistryCenterClient registryCenterClient = registryCenterService.connect(nns); CuratorFramework curatorClient = registryCenterClient.getCuratorClient(); if (checkExists(curatorClient, ExecutorNodePath.SHARDING_COUNT_PATH)) { curatorClient.setData().forPath(ExecutorNodePath.SHARDING_COUNT_PATH, "0".getBytes()); } else { curatorClient.create().forPath(ExecutorNodePath.SHARDING_COUNT_PATH, "0".getBytes()); } asyncRefreshStatistics(); } @Override public void cleanOneJobAnalyse(String jobName, String nns) throws Exception { // 获取当前连接 RegistryCenterClient registryCenterClient = registryCenterService.connect(nns); CuratorFramework curatorClient = registryCenterClient.getCuratorClient(); // reset analyse data. updateResetValue(curatorClient, jobName, ResetCountType.RESET_ANALYSE); resetOneJobAnalyse(jobName, curatorClient); asyncRefreshStatistics(); } @Override public void cleanAllJobAnalyse(String nns) throws Exception { // 获取当前连接 RegistryCenterClient registryCenterClient = registryCenterService.connect(nns); CuratorFramework curatorClient = registryCenterClient.getCuratorClient(); CuratorFrameworkOp curatorFrameworkOp = curatorRepository.newCuratorFrameworkOp(curatorClient); // 遍历所有$Jobs子节点,非系统作业 List<String> jobs = jobDimensionService.getAllUnSystemJobs(curatorFrameworkOp); for (String job : jobs) { resetOneJobAnalyse(job, curatorClient); // reset analyse data. updateResetValue(curatorClient, job, ResetCountType.RESET_ANALYSE); } asyncRefreshStatistics(); } @Override public void cleanAllJobExecutorCount(String nns) throws Exception { // 获取当前连接 RegistryCenterClient registryCenterClient = registryCenterService.connect(nns); CuratorFramework curatorClient = registryCenterClient.getCuratorClient(); CuratorFrameworkOp curatorFrameworkOp = curatorRepository.newCuratorFrameworkOp(curatorClient); // 遍历所有$Jobs子节点,非系统作业 List<String> jobs = jobDimensionService.getAllUnSystemJobs(curatorFrameworkOp); for (String job : jobs) { resetOneJobExecutorCount(job, curatorClient); // reset all jobs' executor's success/failure count. updateResetValue(curatorClient, job, ResetCountType.RESET_SERVERS); } asyncRefreshStatistics(); } @Override public void cleanOneJobExecutorCount(String jobName, String nns) throws Exception { // 获取当前连接 RegistryCenterClient registryCenterClient = registryCenterService.connect(nns); CuratorFramework curatorClient = registryCenterClient.getCuratorClient(); // reset executor's success/failure count. 
updateResetValue(curatorClient, jobName, ResetCountType.RESET_SERVERS); resetOneJobExecutorCount(jobName, curatorClient); asyncRefreshStatistics(); } private void resetOneJobExecutorCount(String jobName, CuratorFramework curatorClient) throws Exception { if (null != curatorClient.checkExists().forPath(JobNodePath.getServerNodePath(jobName))) { List<String> servers = curatorClient.getChildren().forPath(JobNodePath.getServerNodePath(jobName)); for (String server : servers) { if (checkExists(curatorClient, JobNodePath.getProcessSucessCount(jobName, server))) { curatorClient.setData().forPath(JobNodePath.getProcessSucessCount(jobName, server), "0".getBytes()); } else { curatorClient.create().forPath(JobNodePath.getProcessSucessCount(jobName, server), "0".getBytes()); } if (checkExists(curatorClient, JobNodePath.getProcessFailureCount(jobName, server))) { curatorClient.setData().forPath(JobNodePath.getProcessFailureCount(jobName, server), "0".getBytes()); } else { curatorClient.create().forPath(JobNodePath.getProcessFailureCount(jobName, server), "0".getBytes()); } } } } private void resetOneJobAnalyse(String jobName, CuratorFramework curatorClient) throws Exception { if (checkExists(curatorClient, JobNodePath.getProcessCountPath(jobName))) { curatorClient.setData().forPath(JobNodePath.getProcessCountPath(jobName), "0".getBytes()); } else { curatorClient.create().forPath(JobNodePath.getProcessCountPath(jobName), "0".getBytes()); } if (checkExists(curatorClient, JobNodePath.getErrorCountPath(jobName))) { curatorClient.setData().forPath(JobNodePath.getErrorCountPath(jobName), "0".getBytes()); } else { curatorClient.create().forPath(JobNodePath.getErrorCountPath(jobName), "0".getBytes()); } } private void updateResetValue(CuratorFramework curatorFramework, String job, String value) throws Exception { String path = JobNodePath.getAnalyseResetPath(job); if (checkExists(curatorFramework, JobNodePath.getAnalyseResetPath(job))) { curatorFramework.setData().forPath(path, value.getBytes()); } else { curatorFramework.create().creatingParentsIfNeeded().forPath(path, value.getBytes()); } } private void asyncRefreshStatistics() { if(ConsoleUtil.isDashboardOn()){ singleThreadExecutor.submit(refreshStatisticsTask()); } } @Override public Map<String, Integer> loadDomainRankDistribution(String zkBsKey) { Map<String, Integer> domainMap = new HashMap<>(); for (RegistryCenterConfiguration config : RegistryCenterServiceImpl.ZKADDR_TO_ZKCLUSTER_MAP.get(zkBsKey).getRegCenterConfList()) { Integer count = domainMap.get(config.getDegree()); if (null != config.getDegree()) { domainMap.put(config.getDegree(), count == null?1:count + 1); } } return domainMap; } @Override public Map<Integer, Integer> loadJobRankDistribution(String zkBsKey) { Map<Integer, Integer> jobDegreeMap = new HashMap<>(); HashMap<String, JobStatistics> jobStatisticsMap = JOB_MAP_CACHE.get(zkBsKey); if(jobStatisticsMap == null || jobStatisticsMap.values().isEmpty()){ return jobDegreeMap; } for (JobStatistics jobStatistics : jobStatisticsMap.values()) { Integer count = jobDegreeMap.get(jobStatistics.getJobDegree()); jobDegreeMap.put(jobStatistics.getJobDegree(), count == null?1:count + 1); } return jobDegreeMap; } @Override public SaturnStatistics abnormalContainer(String zklist) { return saturnStatisticsService.findStatisticsByNameAndZkList(StatisticsTableKeyConstant.ABNORMAL_CONTAINER, zklist); } @Override public Map<String, Long> versionDomainNumber(String currentZkAddr) { SaturnStatistics ss = 
				saturnStatisticsService.findStatisticsByNameAndZkList(StatisticsTableKeyConstant.VERSION_DOMAIN_NUMBER, currentZkAddr);
		if (ss != null) {
			String result = ss.getResult();
			return JSON.parseObject(result, new TypeReference<Map<String, Long>>(){});
		} else {
			return new HashMap<>();
		}
	}

	@Override
	public Map<String, Long> versionExecutorNumber(String currentZkAddr) {
		SaturnStatistics ss = saturnStatisticsService.findStatisticsByNameAndZkList(StatisticsTableKeyConstant.VERSION_EXECUTOR_NUMBER, currentZkAddr);
		if (ss != null) {
			String result = ss.getResult();
			return JSON.parseObject(result, new TypeReference<Map<String, Long>>(){});
		} else {
			return new HashMap<>();
		}
	}

	/**
	 * Cleans expired AbnormalShardingState entries from ABNORMAL_SHARDING_STATE_CACHE.
	 * @author jamin.li
	 */
	private static class AbnormalShardingCacheCleaner implements Runnable {
		@Override
		public void run() {
			for (Entry<String, AbnormalShardingState> entrySet : ABNORMAL_SHARDING_STATE_CACHE.entrySet()) {
				AbnormalShardingState shardingState = entrySet.getValue();
				if (shardingState.getAlertTime() + ALLOW_DELAY_MILLIONSECONDS * 2 < System.currentTimeMillis()) {
					ABNORMAL_SHARDING_STATE_CACHE.remove(entrySet.getKey());
					log.info("Clean ABNORMAL_SHARDING_STATE_CACHE with key: {}, alertTime: {}, zkNodeCVersion: {}", entrySet.getKey(), shardingState.getAlertTime(), shardingState.getZkNodeCVersion());
				}
			}
		}
	}
}