/** * Licensed to Cloudera, Inc. under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. Cloudera, Inc. licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.cloudera.flume.agent.diskfailover; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; import java.io.IOException; import java.util.concurrent.CountDownLatch; import java.util.concurrent.TimeUnit; import org.junit.After; import org.junit.Before; import org.junit.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.cloudera.flume.agent.DirectMasterRPC; import com.cloudera.flume.agent.FlumeNode; import com.cloudera.flume.agent.LivenessManager; import com.cloudera.flume.agent.MasterRPC; import com.cloudera.flume.conf.FlumeConfiguration; import com.cloudera.flume.master.FlumeMaster; import com.cloudera.util.NetUtils; public class TestDiskFailoverAgent { public static final Logger LOG = LoggerFactory .getLogger(TestDiskFailoverAgent.class); FlumeMaster master = null; FlumeConfiguration cfg; @Before public void setCfg() throws IOException { // Isolate tests by only using simple cfg store cfg = FlumeConfiguration.createTestableConfiguration(); cfg.set(FlumeConfiguration.MASTER_STORE, "memory"); cfg.set(FlumeConfiguration.WEBAPPS_PATH, "build/webapps"); } @After public void shutdownMaster() { if (master != null) { master.shutdown(); master = null; } } /** * This test starts a DFO agent that attempts to go to a port that shouldn't * be open. This triggers the error recovery mechanism. We then wait for 10s * and then simulate the node doing a heartbeat to a master which tells that * that node is decomissioned. As part of decomissioning, the act of closing * the dfo agent should not hang the node. */ @Test public void testActiveDFOClose() throws InterruptedException { final FlumeMaster master = new FlumeMaster(cfg); MasterRPC rpc = new DirectMasterRPC(master); final FlumeNode node = new FlumeNode(rpc, false, false); // should have nothing. assertEquals(0, node.getLogicalNodeManager().getNodes().size()); final CountDownLatch done = new CountDownLatch(1); new Thread("ActiveDFOClose") { public void run() { LivenessManager liveMan = node.getLivenessManager(); try { // update config node to something that will be interrupted. LOG.info("setting to invalid dfo host"); master.getSpecMan().setConfig("node1", "flow", "asciisynth(0)", "agentDFOSink(\"localhost\", 12345)"); master.getSpecMan().addLogicalNode(NetUtils.localhost(), "node1"); liveMan.heartbeatChecks(); Thread.sleep(10000); // update config node to something that will be interrupted. LOG.info("!!! decommissioning node on master"); master.getSpecMan().removeLogicalNode("node1"); // as node do heartbeat and update due to decommission liveMan.heartbeatChecks(); LOG.info("!!! node should be decommissioning on node"); } catch (Exception e) { LOG.info("closed caused an error out: " + e.getMessage(), e); // Right now it takes about 10 seconds for the dfo deco to error out. done.countDown(); return; // expected fail on purpose. } LOG.info("Clean close."); done.countDown(); } }.start(); // false means timeout, takes about 10 seconds to shutdown. assertTrue("close call hung the heartbeat", done .await(45, TimeUnit.SECONDS)); assertEquals(1, node.getLogicalNodeManager().getNodes().size()); } /** * This test starts a DFO agent that attempts to go to a port that shouldn't * be open. This triggers the error recovery mechanism. We then wait for 10s * and then simulate the node doing a heartbeat to a master which tells that * that node is decommissioned. As part of decommissioning, the act of closing * the dfo agent should not hang the node. * * This test differs from the previous by having an bad dns name/request that * will eventually fail (ubuntu/java1.6 takes about 10s) */ @Test public void testActiveDFOCloseBadDNS() throws InterruptedException { final FlumeMaster master = new FlumeMaster(cfg); MasterRPC rpc = new DirectMasterRPC(master); final FlumeNode node = new FlumeNode(rpc, false, false); // should have nothing. assertEquals(0, node.getLogicalNodeManager().getNodes().size()); final CountDownLatch done = new CountDownLatch(1); new Thread() { public void run() { LivenessManager liveMan = node.getLivenessManager(); try { // update config node to something that will be interrupted. LOG.info("setting to invalid dfo host"); master.getSpecMan().setConfig("node1", "flow", "asciisynth(0)", "agentDFOSink(\"invalid\", 12345)"); master.getSpecMan().addLogicalNode(NetUtils.localhost(), "node1"); liveMan.heartbeatChecks(); Thread.sleep(20000); // Takes 10s for dns to fail // update config node to something that will be interrupted. LOG.info("!!! decommissioning node on master"); master.getSpecMan().removeLogicalNode("node1"); liveMan.heartbeatChecks(); LOG.info("!!! logical node should be decommissioning on node"); } catch (Exception e) { LOG.error("closed caused an error out: " + e.getMessage(), e); // Right now it takes about 10 seconds for the dfo deco to error out. done.countDown(); return; // fail } LOG.info("Clean close."); done.countDown(); } }.start(); // false means timeout, takes about 10 seconds to shutdown. assertTrue("close call hung the heartbeat", done .await(45, TimeUnit.SECONDS)); assertEquals(1, node.getLogicalNodeManager().getNodes().size()); } /** * This test starts a E2E agent that attempts to go to a port that shouldn't * be open. This triggers the error recovery mechanism. We then wait for 10s * and then simulate the node doing a heartbeat to a master which tells that * that node is decommissioned. As part of decommissioning, the act of closing * the E2E agent should not hang the node. */ @Test public void testActiveE2EClose() throws InterruptedException { final FlumeMaster master = new FlumeMaster(cfg); MasterRPC rpc = new DirectMasterRPC(master); final FlumeNode node = new FlumeNode(rpc, false, false); // should have nothing. assertEquals(0, node.getLogicalNodeManager().getNodes().size()); final CountDownLatch done = new CountDownLatch(1); new Thread("TestDiskFailoverAgent") { public void run() { LivenessManager liveMan = node.getLivenessManager(); try { // update config node to something that will be interrupted. LOG.info("setting to invalid dfo host"); master.getSpecMan().setConfig("node1", "flow", "asciisynth(0)", "agentE2ESink(\"localhost\", 12345)"); master.getSpecMan().addLogicalNode(NetUtils.localhost(), "node1"); liveMan.heartbeatChecks(); Thread.sleep(10000); // update config node to something that will be interrupted. LOG.info("!!! decommissioning node on master"); master.getSpecMan().removeLogicalNode("node1"); liveMan.heartbeatChecks(); LOG.info("!!! node should be decommissioning on node"); } catch (Exception e) { LOG.error("closed caused an error out: " + e.getMessage(), e); // Right now it takes about 10 seconds for the dfo deco to error out. done.countDown(); return; // fail } LOG.info("Did not expect clean close!?"); } }.start(); // false means timeout, takes about 10 seconds to shutdown. assertTrue("close call hung the heartbeat", done .await(60, TimeUnit.SECONDS)); } /** * This test starts a E2E agent that attempts to go to a port that shouldn't * be open. This triggers the error recovery mechanism. We then wait for 10s * and then simulate the node doing a heartbeat to a master which tells that * that node is decommissioned. As part of decommissioning, the act of closing * the E2E agent should not hang the node. * * This test differs from the previous by having an bad dns name/request that * will eventually fail (ubuntu/java1.6 takes about 10s) */ @Test public void testActiveE2ECloseBadDNS() throws InterruptedException { final FlumeMaster master = new FlumeMaster(cfg); MasterRPC rpc = new DirectMasterRPC(master); final FlumeNode node = new FlumeNode(rpc, false, false); // should have nothing. assertEquals(0, node.getLogicalNodeManager().getNodes().size()); final CountDownLatch done = new CountDownLatch(1); new Thread() { public void run() { LivenessManager liveMan = node.getLivenessManager(); try { // update config node to something that will be interrupted. LOG.info("setting to invalid dfo host"); master.getSpecMan().setConfig("node1", "flow", "asciisynth(0)", "agentE2ESink(\"localhost\", 12345)"); master.getSpecMan().addLogicalNode(NetUtils.localhost(), "node1"); liveMan.heartbeatChecks(); Thread.sleep(15000); // Takes 10s for dns to fail // update config node to something that will be interrupted. LOG.info("!!! decommissioning node on master"); master.getSpecMan().removeLogicalNode("node1"); liveMan.heartbeatChecks(); LOG.info("!!! node should be decommissioning on node"); } catch (Exception e) { LOG.error("closed caused an error out: " + e.getMessage(), e); // Right now it takes about 10 seconds for the dfo deco to error out. done.countDown(); return; // fail } LOG.info("Did not expect clean close? "); } }.start(); // false means timeout, takes about 10 seconds to shutdown. assertTrue("close call hung the heartbeat", done.await(120, TimeUnit.SECONDS)); } }