/**
 * Licensed to Cloudera, Inc. under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. Cloudera, Inc. licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.cloudera.flume.agent.diskfailover;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;

import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.cloudera.flume.agent.LogicalNode;
import com.cloudera.flume.conf.Context;
import com.cloudera.flume.conf.FlumeSpecException;
import com.cloudera.flume.conf.LogicalNodeContext;
import com.cloudera.flume.conf.ReportTestingContext;
import com.cloudera.flume.conf.FlumeConfigData;
import com.cloudera.flume.master.StatusManager.NodeState;
import com.cloudera.flume.reporter.ReportEvent;
import com.cloudera.flume.reporter.ReportManager;
import com.cloudera.flume.reporter.aggregator.AccumulatorSink;
import com.cloudera.util.BenchmarkHarness;
import com.cloudera.util.Clock;

/**
 * This tests the disk failover mode's behavior to make sure it works properly.
 *
 * TODO Several tests end by checking if the collector is in ERROR or IDLE
 * state. This could be because of the randomness in the pipeline, but I am not
 * completely sure about this explanation.
 */
public class TestDiskFailoverBehavior {
  public static final Logger LOG = LoggerFactory
      .getLogger(TestDiskFailoverBehavior.class);

  @Before
  public void setup() {
    BenchmarkHarness.setupLocalWriteDir();
  }

  @After
  public void teardown() throws IOException {
    BenchmarkHarness.cleanupLocalWriteDir();
  }

  LogicalNode setupAgent(long count, String agentSink) throws IOException,
      RuntimeException, FlumeSpecException {
    LogicalNode agent = new LogicalNode(
        new LogicalNodeContext("phys", "agent"), "agent");
    FlumeConfigData fcd = new FlumeConfigData(0, "asciisynth(" + count + ")",
        agentSink, 1, 1, "flow");
    agent.loadConfig(fcd);
    return agent;
  }

  LogicalNode setupColl(long port, String name, String acc) throws IOException,
      RuntimeException, FlumeSpecException {
    Context ctx = new LogicalNodeContext(new ReportTestingContext(), "phys",
        name);
    LogicalNode coll = new LogicalNode(ctx, name);
    FlumeConfigData fcd2 = new FlumeConfigData(0, "rpcSource(" + port + ")",
        "accumulator(\"" + acc + "\")", 1, 1, "flow");
    coll.loadConfig(fcd2);
    return coll;
  }

  void loopUntilCount(long count, LogicalNode coll, LogicalNode coll2)
      throws InterruptedException {
    boolean done = false;
    int loops = 0;
    AccumulatorSink ctr = (AccumulatorSink) ReportManager.get().getReportable(
        "count");
    AccumulatorSink ctr2 = (AccumulatorSink) ReportManager.get().getReportable(
        "count2");
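    // ctr and coll may be null when the primary collector was never started
    // (testDFOCollectors1NotUp passes null for coll), so those accesses are
    // null-guarded below.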
    while (!done) {
      Clock.sleep(1000);
      long cnt1 = (ctr == null) ? 0 : ctr.getCount();
      long cnt2 = (ctr2 == null) ? 0 : ctr2.getCount();
      LOG.info("loop " + loops + " collector count = " + cnt1 + " count2 = "
          + cnt2);
      if (coll != null) {
        LOG.info(coll.getReport().toText());
      }
      LOG.info(coll2.getReport().toText());
      if (cnt1 + cnt2 >= count)
        break;
      loops++;
    }
  }

  /**
   * Test the DFO failure path assuming the retry sink is reliable, e.g. the
   * secondary/failover in this case will not error out.
   */
  @Test
  public void testDFOPerfectRetry() throws IOException, RuntimeException,
      FlumeSpecException, InterruptedException {
    long count = 1000;

    // start the collectors first
    LogicalNode coll = setupColl(12345, "coll", "count");
    LogicalNode coll2 = setupColl(12346, "coll2", "count2");

    // then the agent so it can connect
    String agentSink = "< { flakeyAppend(.1, 1337) => rpcSink(\"localhost\",12345) } ?"
        + " {diskFailover => {insistentAppend => { lazyOpen "
        + "=> rpcSink(\"localhost\",12345) } } } >";
    LogicalNode agent = setupAgent(count, agentSink);

    // wait until the counts add up properly
    AccumulatorSink ctr = (AccumulatorSink) ReportManager.get().getReportable(
        "count");
    AccumulatorSink ctr2 = (AccumulatorSink) ReportManager.get().getReportable(
        "count2");
    loopUntilCount(count, coll, coll2);
    assertEquals(NodeState.IDLE, agent.getStatus().state);

    // close off the collectors and the agent
    coll.close();
    coll2.close();
    agent.close();

    // check output
    LOG.info("primary collector count = " + ctr.getCount());
    LOG.info("secondary collector count = " + ctr2.getCount());
    assertEquals(count, ctr.getCount() + ctr2.getCount());

    // the collector can be in ERROR or IDLE state because of the randomness.
    NodeState stateColl = coll.getStatus().state;
    LOG.info("coll exited in state: " + stateColl);
    assertTrue(stateColl.equals(NodeState.IDLE)
        || stateColl.equals(NodeState.ERROR));

    NodeState stateColl2 = coll2.getStatus().state;
    LOG.info("coll2 exited in state: " + stateColl2);
    assertTrue(stateColl2.equals(NodeState.IDLE)
        || stateColl2.equals(NodeState.ERROR));
  }

  /**
   * Same test, now with a flakey sink to the diskFailover. This case is
   * supposed to propagate the problem through to the parent and throw some
   * sort of exception.
   */
  @Test
  public void testDFOFlakeyRetry() throws IOException, RuntimeException,
      FlumeSpecException, InterruptedException {
    long count = 1000;

    // start the collectors first
    LogicalNode coll = setupColl(12345, "coll", "count");
    LogicalNode coll2 = setupColl(12346, "coll2", "count2");

    // Then the agent so it can connect. This version assumes that the
    // secondary/failover case will fail and pass an exception back to the
    // primary.
    String agentSink = "< { flakeyAppend(.1,1337) => rpcSink(\"localhost\",12345) } ?"
        + " {diskFailover => {lazyOpen => {flakeyAppend(.1,1337) "
        + "=> rpcSink(\"localhost\",12346) } } } >";
    LogicalNode agent = setupAgent(count, agentSink);

    // wait for the agent to finish and the counts to stop changing
    boolean done = false;
    int loops = 0;
    AccumulatorSink ctr = (AccumulatorSink) ReportManager.get().getReportable(
        "count");
    AccumulatorSink ctr2 = (AccumulatorSink) ReportManager.get().getReportable(
        "count2");
    long old = 0;
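    // Unlike loopUntilCount, this loop exits once the primary collector's
    // count stops changing between one-second polls; with a flakey failover
    // path the test only requires delivery to stall, not an exact total.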
    while (!done) {
      Clock.sleep(1000);
      LOG.info("loop " + loops + " collector count = " + ctr.getCount()
          + " count2 = " + ctr2.getCount());
      LOG.info(coll.getReport().toText());
      LOG.info(coll2.getReport().toText());
      if (old == ctr.getCount()) {
        break;
      }
      old = ctr.getCount();
      loops++;
    }

    // close off the collectors and the agent
    coll.close();
    coll2.close();
    agent.close();
  }

  /**
   * Test the DFO failure path with an unreliable subsink that has been
   * decorated so that it will not give up trying to send. In this case,
   * despite the flakey rpc, insistentAppend and stubbornAppend make the sink
   * resilient, and it never gives up.
   */
  @Test
  public void testDFOInsistentRetry() throws IOException, RuntimeException,
      FlumeSpecException, InterruptedException {
    long count = 100;

    // start the collectors first
    LogicalNode coll = setupColl(12345, "coll", "count");
    LogicalNode coll2 = setupColl(12346, "coll2", "count2");

    // Then the agent so it can connect. This config will attempt to send on
    // the primary, and when it fails, falls back to writing to disk. The
    // subsink of diskFailover is decorated to never return with an exception.
    String agentSink = "{ delay(100) => < "
        + "{ flakeyAppend(.05) => rpcSink(\"localhost\",12345) } ?"
        + " {diskFailover => { insistentAppend => { stubbornAppend => { insistentOpen "
        + "=> { lazyOpen => {flakeyAppend(.05) => rpcSink(\"localhost\",12346) } } } } } }> } ";
    LogicalNode agent = setupAgent(count, agentSink);

    // wait until the counts add up properly
    AccumulatorSink ctr = (AccumulatorSink) ReportManager.get().getReportable(
        "count");
    AccumulatorSink ctr2 = (AccumulatorSink) ReportManager.get().getReportable(
        "count2");
    loopUntilCount(count, coll, coll2);

    // close off the collectors
    coll.close();
    coll2.close();

    // dump info for debugging
    Map<String, ReportEvent> rpts = new HashMap<String, ReportEvent>();
    agent.getReports(rpts);
    for (Entry<String, ReportEvent> e : rpts.entrySet()) {
      LOG.info(e.getKey() + " : " + e.getValue());
    }

    // check the end states
    assertEquals(count, ctr.getCount() + ctr2.getCount());

    // the collector can be in ERROR or IDLE state because of the randomness.
    NodeState stateColl = coll.getStatus().state;
    LOG.info("coll exited in state: " + stateColl);
    assertTrue(stateColl.equals(NodeState.IDLE)
        || stateColl.equals(NodeState.ERROR));

    NodeState stateColl2 = coll2.getStatus().state;
    LOG.info("coll2 exited in state: " + stateColl2);
    assertTrue(stateColl2.equals(NodeState.IDLE)
        || stateColl2.equals(NodeState.ERROR));
  }

  /**
   * Test the DFO cases where collectors are not initially up, but show up
   * later.
   */
  @Test
  public void testDFOCollectorsNotUp() throws IOException, RuntimeException,
      FlumeSpecException, InterruptedException {
    long count = 100;

    // Start the agent first.
    String agentSink = "{ delay(100) => < "
        + "{ flakeyAppend(.05) => rpcSink(\"localhost\",12345) } ?"
        + " {diskFailover => { insistentAppend => { stubbornAppend => { insistentOpen "
        + "=> { lazyOpen => {flakeyAppend(.05) => rpcSink(\"localhost\",12346) } } } } } }> } ";
    LogicalNode agent = setupAgent(count, agentSink);

    // Purposely sleep a little so that the agent is collecting to disk, then
    // start collectors
    Clock.sleep(2000);
    LogicalNode coll = setupColl(12345, "coll", "count");
    LogicalNode coll2 = setupColl(12346, "coll2", "count2");

    // wait until the counts add up properly
    AccumulatorSink ctr = (AccumulatorSink) ReportManager.get().getReportable(
        "count");
    AccumulatorSink ctr2 = (AccumulatorSink) ReportManager.get().getReportable(
        "count2");
    loopUntilCount(count, coll, coll2);

    // close off the collectors
    coll.close();
    coll2.close();

    // dump info for debugging
    Map<String, ReportEvent> rpts = new HashMap<String, ReportEvent>();
    agent.getReports(rpts);
    for (Entry<String, ReportEvent> e : rpts.entrySet()) {
      LOG.info(e.getKey() + " : " + e.getValue());
    }

    // check the end states
    assertEquals(count, ctr.getCount() + ctr2.getCount());
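    // Both collectors are expected to receive some events: the flakey primary
    // pushes part of the stream through the disk-failover path, which
    // delivers to the secondary.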
    assertTrue(ctr.getCount() > 0);
    assertTrue(ctr2.getCount() > 0);

    // the collector can be in ERROR or IDLE state because of the randomness.
    NodeState stateColl = coll.getStatus().state;
    LOG.info("coll exited in state: " + stateColl);
    assertTrue(stateColl.equals(NodeState.IDLE)
        || stateColl.equals(NodeState.ERROR));

    NodeState stateColl2 = coll2.getStatus().state;
    LOG.info("coll2 exited in state: " + stateColl2);
    assertTrue(stateColl2.equals(NodeState.IDLE)
        || stateColl2.equals(NodeState.ERROR));
  }

  /**
   * This tests the DFO case where the primary is never even started.
   */
  @Test
  public void testDFOCollectors1NotUp() throws IOException, RuntimeException,
      FlumeSpecException, InterruptedException {
    long count = 100;

    // Start the agent first.
    String agentSink = "{ delay(100) => < "
        + "{ flakeyAppend(.05) => rpcSink(\"localhost\",12345) } ?"
        + " {diskFailover => { insistentAppend => { stubbornAppend => { insistentOpen "
        + "=> { lazyOpen => {flakeyAppend(.05) => rpcSink(\"localhost\",12346) } } } } } }> } ";
    LogicalNode agent = setupAgent(count, agentSink);

    // Purposely sleep a little so that the agent is collecting to disk, then
    // start the collector
    Clock.sleep(2000);
    LogicalNode coll2 = setupColl(12346, "coll2", "count2");

    // wait until the counts add up properly
    AccumulatorSink ctr2 = (AccumulatorSink) ReportManager.get().getReportable(
        "count2");
    loopUntilCount(count, null, coll2);

    // close off the collector
    coll2.close();

    // dump info for debugging
    Map<String, ReportEvent> rpts = new HashMap<String, ReportEvent>();
    agent.getReports(rpts);
    for (Entry<String, ReportEvent> e : rpts.entrySet()) {
      LOG.info(e.getKey() + " : " + e.getValue());
    }

    // check the end states
    assertEquals(count, ctr2.getCount());

    // the collector can be in ERROR or IDLE state because of the randomness.
    NodeState stateColl2 = coll2.getStatus().state;
    LOG.info("coll2 exited in state: " + stateColl2);
    assertTrue(stateColl2.equals(NodeState.IDLE)
        || stateColl2.equals(NodeState.ERROR));
  }
}