/** * vips Inc. Copyright (c) 2016 All Rights Reserved. */ package com.vip.saturn.it.impl; import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.fail; import java.util.List; import org.junit.*; import com.vip.saturn.it.AbstractSaturnIT; import com.vip.saturn.it.JobType; import com.vip.saturn.it.job.LongtimeJavaJob; import com.vip.saturn.job.internal.config.JobConfiguration; import com.vip.saturn.job.internal.execution.ExecutionNode; import com.vip.saturn.job.internal.sharding.ShardingNode; import com.vip.saturn.job.internal.storage.JobNodePath; import com.vip.saturn.job.utils.ItemUtils; import org.junit.runners.MethodSorters; @FixMethodOrder(MethodSorters.NAME_ASCENDING) public class FailoverIT extends AbstractSaturnIT { @BeforeClass public static void setUp() throws Exception { startNamespaceShardingManagerList(1); } @AfterClass public static void tearDown() throws Exception { stopNamespaceShardingManagerList(); } @Before public void before() { LongtimeJavaJob.statusMap.clear(); } @After public void after() { LongtimeJavaJob.statusMap.clear(); } /** * 场景1:如果有空闲的Executor,failover就会立即执行,不需要等到主节点sharding完成 * Executor个数 > 分片个数的情况 * * @throws Exception */ @Test public void test_A_JavaJob() throws Exception { startExecutorList(3);// 设置3个Executor final int shardCount = 2;// 设置2个分片 final String jobName = "failoverITJobJava1"; failover(shardCount, jobName); stopExecutorList(); } /** * 场景2:普通的failover场景 * Executor个数 = 分片个数的情况 * * @throws Exception */ @Test public void test_B_JavaJob() throws Exception { startExecutorList(2);// 设置2个Executor final int shardCount = 2;// 设置2个分片 final String jobName = "failoverITJobJava2"; failover(shardCount, jobName); stopExecutorList(); } /** * 场景3:在failover执行之前禁用的作业重新启用后不应该继续上次的failover流程 * * @throws Exception */ @Test public void test_C_JavaJob() throws Exception { startExecutorList(2);// 设置2个Executor final int shardCount = 2;// 设置2个分片 final String jobName = "failoverITJobJava3"; failoverWithDisabled(shardCount, jobName); stopExecutorList(); } /** * * @param shardCount * @param jobName * @throws InterruptedException * @throws Exception */ private void failover(final int shardCount, final String jobName) throws InterruptedException, Exception { for (int i = 0; i < shardCount; i++) { String key = jobName + "_" + i; LongtimeJavaJob.JobStatus status = new LongtimeJavaJob.JobStatus(); status.runningCount = 0; status.sleepSeconds = 10; status.finished = false; status.timeout = false; status.killed = false; LongtimeJavaJob.statusMap.put(key, status); } //1 新建一个执行时间为10S的作业,它只能手工触发 final JobConfiguration jobConfiguration = new JobConfiguration(jobName); jobConfiguration.setCron("0 0 1 1 * ?"); jobConfiguration.setJobType(JobType.JAVA_JOB.toString()); jobConfiguration.setJobClass(LongtimeJavaJob.class.getCanonicalName()); jobConfiguration.setShardingTotalCount(shardCount); jobConfiguration.setShardingItemParameters("0=0,1=1,2=2"); addJob(jobConfiguration); Thread.sleep(1000); //2 启动作业并立刻执行一次 enableJob(jobConfiguration.getJobName()); Thread.sleep(2000); runAtOnce(jobName); //3 保证全部作业分片正在运行中 try { waitForFinish(new FinishCheck() { @Override public boolean docheck() { for (int j = 0; j < shardCount; j++) { if (!regCenter .isExisted(JobNodePath.getNodeFullPath(jobName, ExecutionNode.getRunningNode(j)))) { return false; } } return true; } }, 6); } catch (Exception e) { e.printStackTrace(); fail(e.getMessage()); } Thread.sleep(2000); final List<Integer> items = ItemUtils.toItemList(regCenter.getDirectly(JobNodePath.getNodeFullPath(jobName, ShardingNode.getShardingNode(saturnExecutorList.get(0).getExecutorName())))); //4 停止第一个executor,在该executor上运行的分片会失败转移 stopExecutor(0); System.out.println("items:" + items); try { waitForFinish(new FinishCheck() { @Override public boolean docheck() { for (Integer item : items) { if (!isFailoverAssigned(jobConfiguration, item)) { return false; } } return true; } }, 20); } catch (Exception e) { e.printStackTrace(); fail(e.getMessage()); } Thread.sleep(1000); //5 检查停止的executor 上面的分片是否已经被KILL for (Integer item : items) { String key = jobName + "_" + item; LongtimeJavaJob.JobStatus status = LongtimeJavaJob.statusMap.get(key); if (!status.finished || !status.killed) { fail("should finish and killed"); } status.runningCount = 0; } //6 保证全部分片都会执行一次(被停止的executor上的分片会失败转移从而也会执行一次) try { waitForFinish(new FinishCheck() { @Override public boolean docheck() { for (int j = 0; j < shardCount; j++) { String key = jobName + "_" + j; LongtimeJavaJob.JobStatus status = LongtimeJavaJob.statusMap.get(key); if (status.runningCount <= 0) { return false; } } return true; } }, 60); } catch (Exception e) { e.printStackTrace(); fail(e.getMessage()); } removeJob(jobConfiguration.getJobName()); Thread.sleep(2000); LongtimeJavaJob.statusMap.clear(); } /** * 在failover执行之前禁用的作业重新启用后不应该继续上次的failover流程 * * @param shardCount * @param jobName * @throws InterruptedException * @throws Exception */ private void failoverWithDisabled(final int shardCount, final String jobName) throws InterruptedException, Exception { for (int i = 0; i < shardCount; i++) { String key = jobName + "_" + i; LongtimeJavaJob.JobStatus status = new LongtimeJavaJob.JobStatus(); status.runningCount = 0; status.sleepSeconds = 20; status.finished = false; status.timeout = false; status.killed = false; LongtimeJavaJob.statusMap.put(key, status); } //1 新建一个执行时间为10S的作业,它只能手工触发 final JobConfiguration jobConfiguration = new JobConfiguration(jobName); jobConfiguration.setCron("0 0 1 1 * ?"); jobConfiguration.setJobType(JobType.JAVA_JOB.toString()); jobConfiguration.setJobClass(LongtimeJavaJob.class.getCanonicalName()); jobConfiguration.setShardingTotalCount(shardCount); jobConfiguration.setShardingItemParameters("0=0,1=1,2=2"); addJob(jobConfiguration); Thread.sleep(1000); //2 启动作业并立刻执行一次 enableJob(jobConfiguration.getJobName()); Thread.sleep(2000); runAtOnce(jobName); //3 保证全部作业分片正在运行中 try { waitForFinish(new FinishCheck() { @Override public boolean docheck() { for (int j = 0; j < shardCount; j++) { if (!regCenter .isExisted(JobNodePath.getNodeFullPath(jobName, ExecutionNode.getRunningNode(j)))) { return false; } } return true; } }, 6); } catch (Exception e) { e.printStackTrace(); fail(e.getMessage()); } Thread.sleep(2000); final String firstExecutorName = saturnExecutorList.get(0).getExecutorName(); final List<Integer> items = ItemUtils.toItemList(regCenter.getDirectly(JobNodePath.getNodeFullPath(jobName, ShardingNode.getShardingNode(firstExecutorName)))); final String secondExecutorName = saturnExecutorList.get(1).getExecutorName(); final List<Integer> items2 = ItemUtils.toItemList(regCenter.getDirectly(JobNodePath.getNodeFullPath(jobName, ShardingNode.getShardingNode(secondExecutorName)))); //4 停止第一个executor,在该executor上运行的分片会失败转移 stopExecutor(0); System.out.println("items:" + items); //5 直到第一个Executor完全下线 try { waitForFinish(new FinishCheck() { @Override public boolean docheck() { if(isOnline(firstExecutorName)){// 判断该Executor是否在线 return false; } return true; } }, 20); } catch (Exception e) { e.printStackTrace(); fail(e.getMessage()); } //6 检查停止的executor 上面的分片是否已经被KILL for (Integer item : items) { String key = jobName + "_" + item; LongtimeJavaJob.JobStatus status = LongtimeJavaJob.statusMap.get(key); if (!status.finished || !status.killed) { fail("should finish and killed"); } status.runningCount = 0; } //7 检查运行executor2上的分片都正在运行,而且runningCount为0 for (Integer item : items2) { String key = jobName + "_" + item; LongtimeJavaJob.JobStatus status = LongtimeJavaJob.statusMap.get(key); if (status.finished || status.killed || status.timeout) { fail("should running"); } if(status.runningCount != 0) { fail("runningCount should be 0"); } } //8 禁用作业 disableJob(jobName); //9 等待executor2分片运行完 try { waitForFinish(new FinishCheck() { @Override public boolean docheck() { for (Integer item : items2) { String key = jobName + "_" + item; LongtimeJavaJob.JobStatus status = LongtimeJavaJob.statusMap.get(key); if(!status.finished) { return false; } } return true; } }, 20); } catch (Exception e) { e.printStackTrace(); fail(e.getMessage()); } //10 检测无failover信息 assertThat(noFailoverItems(jobConfiguration)); for (Integer item : items) { assertThat(isFailoverAssigned(jobConfiguration, item)).isEqualTo(false); } //11 检测只executor2的分片只运行了一次 Thread.sleep(2000); for (Integer item : items2) { String key = jobName + "_" + item; LongtimeJavaJob.JobStatus status = LongtimeJavaJob.statusMap.get(key); if (status.runningCount != 1) { fail("runningCount should be 1"); } } removeJob(jobConfiguration.getJobName()); Thread.sleep(2000); LongtimeJavaJob.statusMap.clear(); } }