/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.ha;

import static org.junit.Assume.assumeTrue;

import java.util.Random;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.Shell;
import org.apache.hadoop.util.Time;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import org.mockito.Mockito;
import org.mockito.invocation.InvocationOnMock;
import org.mockito.stubbing.Answer;

/**
 * Stress test for ZKFailoverController.
 * Starts multiple ZKFCs for dummy services, and then performs many automatic
 * failovers. While doing so, ensures that a fake "shared resource"
 * (simulating the shared edits dir) is only owned by one service at a time.
 */
public class TestZKFailoverControllerStress extends ClientBaseWithFixes {

  private static final int STRESS_RUNTIME_SECS = 30;
  private static final int EXTRA_TIMEOUT_SECS = 10;

  /** How long each stress loop keeps running, in milliseconds. */
  private static final long STRESS_RUNTIME_MS = STRESS_RUNTIME_SECS * 1000L;

  /** JUnit timeout for each test: the stress runtime plus some slack. */
  private static final long TEST_TIMEOUT_MS =
      (STRESS_RUNTIME_SECS + EXTRA_TIMEOUT_SECS) * 1000L;

  private Configuration conf;
  private MiniZKFCCluster cluster;

  /**
   * Value of the static {@code ActiveStandbyElector.NUM_RETRIES} captured when
   * this test instance is constructed, i.e. before any {@code @Before} method
   * runs. One of the tests overwrites that static; it is put back in
   * {@link #stopCluster()} so the change does not leak into other tests that
   * run in the same JVM.
   */
  private final int savedElectorRetries = ActiveStandbyElector.NUM_RETRIES;

  /**
   * Builds the configuration pointing at the test ZK server and creates
   * (but does not start) the mini ZKFC cluster.
   */
  @Before
  public void setupConfAndServices() throws Exception {
    // skip tests on Windows until after resolution of ZooKeeper client bug
    assumeTrue(!Shell.WINDOWS);
    conf = new Configuration();
    conf.set(ZKFailoverController.ZK_QUORUM_KEY, hostPort);
    this.cluster = new MiniZKFCCluster(conf, getServer(serverFactory));
  }

  /**
   * Stops the cluster if it was created, then restores the elector retry
   * count that was in effect before the test ran.
   */
  @After
  public void stopCluster() throws Exception {
    try {
      if (cluster != null) {
        cluster.stop();
      }
    } finally {
      // Restore only after the cluster has been stopped, so that the
      // services under test keep the test's retry setting until shutdown.
      ActiveStandbyElector.NUM_RETRIES = savedElectorRetries;
    }
  }

  /**
   * Simply fail back and forth between two services for the
   * configured amount of time, via expiring their ZK sessions.
   */
  @Test(timeout = TEST_TIMEOUT_MS)
  public void testExpireBackAndForth() throws Exception {
    cluster.start();
    long st = Time.now();

    int i = 0;
    while (Time.now() - st < STRESS_RUNTIME_MS) {
      // flip flop the services back and forth
      int from = i % 2;
      int to = (i + 1) % 2;

      // Expire one service, it should fail over to the other
      LOG.info("Failing over via expiration from " + from + " to " + to);
      cluster.expireAndVerifyFailover(from, to);

      i++;
    }
  }

  /**
   * Randomly expire the ZK sessions of the two ZKFCs. This differs
   * from the above test in that it is not a controlled failover -
   * we just do random expirations and expect neither one to ever
   * generate fatal exceptions.
   */
  @Test(timeout = TEST_TIMEOUT_MS)
  public void testRandomExpirations() throws Exception {
    cluster.start();
    long st = Time.now();

    Random r = new Random();
    while (Time.now() - st < STRESS_RUNTIME_MS) {
      cluster.getTestContext().checkException();
      int targetIdx = r.nextInt(2);
      ActiveStandbyElector target = cluster.getElector(targetIdx);
      long sessId = target.getZKSessionIdForTests();
      // -1 is skipped: there is no session to close in that case.
      if (sessId != -1) {
        LOG.info(String.format("Expiring session %x for svc %d",
            sessId, targetIdx));
        getServer(serverFactory).closeSession(sessId);
      }
      Thread.sleep(r.nextInt(300));
    }
  }

  /**
   * Have the services fail their health checks half the time,
   * causing the master role to bounce back and forth in the
   * cluster. Meanwhile, causes ZK to disconnect clients every
   * 50ms, to trigger the retry code and failures to become active.
   */
  @Test(timeout = TEST_TIMEOUT_MS)
  public void testRandomHealthAndDisconnects() throws Exception {
    Mockito.doAnswer(new RandomlyThrow(0))
        .when(cluster.getService(0).proxy).monitorHealth();
    Mockito.doAnswer(new RandomlyThrow(1))
        .when(cluster.getService(1).proxy).monitorHealth();

    // Static setting; the previous value is restored in stopCluster().
    ActiveStandbyElector.NUM_RETRIES = 100;

    // Don't start until after the above mocking. Otherwise we can get
    // Mockito errors if the HM calls the proxy in the middle of
    // setting up the mock.
    cluster.start();

    long st = Time.now();
    while (Time.now() - st < STRESS_RUNTIME_MS) {
      cluster.getTestContext().checkException();
      serverFactory.closeAll();
      Thread.sleep(50);
    }
  }

  /**
   * Randomly throw an exception half the time the method is called;
   * otherwise delegate to the real method of the mocked object.
   */
  private static class RandomlyThrow implements Answer<Object> {
    private final Random r = new Random();
    private final int svcIdx;

    /**
     * @param svcIdx index of the service this answer is attached to;
     *               only used in the log message
     */
    public RandomlyThrow(int svcIdx) {
      this.svcIdx = svcIdx;
    }

    @Override
    public Object answer(InvocationOnMock invocation)
        throws Throwable {
      if (r.nextBoolean()) {
        LOG.info("Throwing an exception for svc " + svcIdx);
        throw new HealthCheckFailedException("random failure");
      }
      return invocation.callRealMethod();
    }
  }
}