/** * Copyright 2016 vip.com. * <p> * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the * specific language governing permissions and limitations under the License. * </p> */ package com.vip.saturn.job.internal.failover; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.concurrent.TimeUnit; import com.vip.saturn.job.internal.storage.JobNodePath; import org.apache.zookeeper.CreateMode; import org.apache.zookeeper.KeeperException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.vip.saturn.job.basic.AbstractSaturnService; import com.vip.saturn.job.basic.JobScheduler; import com.vip.saturn.job.internal.config.ConfigurationNode; import com.vip.saturn.job.internal.execution.ExecutionNode; import com.vip.saturn.job.internal.storage.LeaderExecutionCallback; /** * 作业失效转移服务. * @author dylan.xue */ public class FailoverService extends AbstractSaturnService { static Logger log = LoggerFactory.getLogger(FailoverService.class); public FailoverService(final JobScheduler jobScheduler) { super(jobScheduler); } @Override public void start(){ } /** * 设置失效的分片项标记. * * @param item 崩溃的作业项 */ public void createCrashedFailoverFlag(final int item) { if (!isFailoverAssigned(item)) { try { getJobNodeStorage().getClient().create().creatingParentsIfNeeded().withMode(CreateMode.PERSISTENT).forPath(JobNodePath.getNodeFullPath(jobName, FailoverNode.getItemsNode(item))); log.info("{} - {} create failover flag of item {}", executorName, jobName, item); } catch (KeeperException.NodeExistsException e) { log.debug("{} - {} create failover flag of item {} failed, because it is already existing", executorName, jobName, item); } catch (Exception e) { log.error(e.getMessage(), e); } } } public boolean isFailoverAssigned(final Integer item) { return getJobNodeStorage().isJobNodeExisted(FailoverNode.getExecutionFailoverNode(item)); } /** * 如果需要失效转移, 则设置作业失效转移. */ public void failoverIfNecessary() { if (!needFailover()) { return; } getJobNodeStorage().executeInLeader(FailoverNode.LATCH, new FailoverLeaderExecutionCallback(), 1, TimeUnit.MINUTES, new FailoverTimeoutLeaderExecutionCallback()); } private boolean needFailover() { return getJobNodeStorage().isJobNodeExisted(FailoverNode.ITEMS_ROOT) && !getJobNodeStorage().getJobNodeChildrenKeys(FailoverNode.ITEMS_ROOT).isEmpty() && getJobNodeStorage().isJobNodeExisted(ConfigurationNode.ENABLED) && Boolean.valueOf(getJobNodeStorage().getJobNodeData(ConfigurationNode.ENABLED)); } /** * 更新执行完毕失效转移的分片项状态. * * @param item 执行完毕失效转移的分片项列表 */ public void updateFailoverComplete(final Integer item) { getJobNodeStorage().removeJobNodeIfExisted(FailoverNode.getExecutionFailoverNode(item)); } /** * 获取运行在本作业服务器的失效转移序列号. * * @return 运行在本作业服务器的失效转移序列号 */ public List<Integer> getLocalHostFailoverItems() { List<String> items = getJobNodeStorage().getJobNodeChildrenKeys(ExecutionNode.ROOT); List<Integer> result = new ArrayList<>(items.size()); for (String each : items) { int item = Integer.parseInt(each); String node = FailoverNode.getExecutionFailoverNode(item); if (getJobNodeStorage().isJobNodeExisted(node) && executorName.equals(getJobNodeStorage().getJobNodeDataDirectly(node))) { result.add(item); } } Collections.sort(result); return result; } /** * 获取运行在本作业服务器的被失效转移的序列号. * * @return 运行在本作业服务器的被失效转移的序列号 */ public List<Integer> getLocalHostTakeOffItems() { List<Integer> shardingItems = jobScheduler.getShardingService().getLocalHostShardingItems(); List<Integer> result = new ArrayList<>(shardingItems.size()); for (int each : shardingItems) { if (getJobNodeStorage().isJobNodeExisted(FailoverNode.getExecutionFailoverNode(each))) { result.add(each); } } return result; } /** * 删除作业失效转移信息. */ public void removeFailoverInfo() { getJobNodeStorage().removeJobNodeIfExisted(FailoverNode.ITEMS_ROOT); for (String each : getJobNodeStorage().getJobNodeChildrenKeys(ExecutionNode.ROOT)) { getJobNodeStorage().removeJobNodeIfExisted(FailoverNode.getExecutionFailoverNode(Integer.parseInt(each))); } } class FailoverLeaderExecutionCallback implements LeaderExecutionCallback { @Override public void execute() { if (!needFailover()) { return; } if(jobScheduler == null){ return; } if(!jobScheduler.getConfigService().getPreferList().contains(executorName) && !jobScheduler.getConfigService().isUseDispreferList()){ return; } List<String> items = getJobNodeStorage().getJobNodeChildrenKeys(FailoverNode.ITEMS_ROOT); if(items != null && !items.isEmpty()) { int crashedItem = Integer.parseInt(getJobNodeStorage().getJobNodeChildrenKeys(FailoverNode.ITEMS_ROOT).get(0)); log.info("[{}] msg=Elastic job: failover job begin, crashed item:{}.", jobName, crashedItem); getJobNodeStorage().fillEphemeralJobNode(FailoverNode.getExecutionFailoverNode(crashedItem), executorName); getJobNodeStorage().removeJobNodeIfExisted(FailoverNode.getItemsNode(crashedItem)); jobScheduler.triggerJob(); } } } class FailoverTimeoutLeaderExecutionCallback implements LeaderExecutionCallback { @Override public void execute() { log.warn("Failover leader election timeout with a minute"); } } }