/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package gobblin.cluster;

import com.google.common.base.Optional;
import com.google.common.base.Predicate;
import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;
import com.typesafe.config.ConfigValueFactory;

import gobblin.testing.AssertWithBackoff;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.Collection;
import java.util.Collections;
import java.util.concurrent.TimeoutException;

import org.apache.commons.io.FileUtils;
import org.apache.curator.test.TestingServer;
import org.apache.hadoop.fs.Path;
import org.apache.helix.HelixManager;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.testng.Assert;
import org.testng.annotations.AfterClass;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;


/**
 * Unit tests for killing {@link GobblinClusterManager}s and {@link GobblinTaskRunner}s
 *
 * <p>
 * This class uses a {@link TestingServer} as an embedded ZooKeeper server for testing. The Curator
 * framework is used to provide a ZooKeeper client. This class also uses the {@link HelixManager} to
 * act as a testing Helix participant to receive the container (running the {@link GobblinTaskRunner})
 * shutdown request message.
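 *
 * The tests start two cluster managers and two task runners (see {@code NUM_MANAGERS} and {@code NUM_WORKERS}),
 * then disconnect a worker, disconnect a manager to force a leader election, and finally reconnect a manager,
 * checking after each step that the test job still runs to completion.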
 * </p>
 */
@Test(groups = { "gobblin.cluster" }, singleThreaded = true)
public class GobblinClusterKillTest {
  public final static Logger LOG = LoggerFactory.getLogger(GobblinClusterKillTest.class);

  private TestingServer _testingZKServer;

  // AssertWithBackoff polling limits, in milliseconds
  private final static int ASSERT_TIMEOUT = 60000;
  private final static int ASSERT_MAX_SLEEP = 2000;

  private final static int NUM_MANAGERS = 2;
  private final static int NUM_WORKERS = 2;

  private GobblinClusterManager[] _clusterManagers;
  private GobblinTaskRunner[] _clusterWorkers;
  private Thread[] _workerStartThreads;
  private String _testDirPath;
  private String _jobDirPath;

  Config _config;

  /**
   * Clean up and set up the test directory.
   */
  private void setupTestDir() throws IOException {
    _testDirPath = _config.getString("gobblin.cluster.work.dir");
    _jobDirPath = _config.getString(GobblinClusterConfigurationKeys.JOB_CONF_PATH_KEY);

    // clean up test directory and create job dir
    File testDir = new File(_testDirPath);
    File jobDir = new File(_jobDirPath);

    if (testDir.exists()) {
      FileUtils.deleteDirectory(testDir);
    }
    jobDir.mkdirs();

    // copy job file from resource
    String jobFileName = GobblinClusterKillTest.class.getSimpleName() + "Job.conf";
    try (InputStream resourceStream = this.getClass().getClassLoader().getResourceAsStream(jobFileName)) {
      if (resourceStream == null) {
        throw new RuntimeException("Could not find job resource " + jobFileName);
      }
      File targetFile = new File(_jobDirPath + "/" + jobFileName);
      FileUtils.copyInputStreamToFile(resourceStream, targetFile);
    } catch (IOException e) {
      throw new RuntimeException("Unable to load job resource " + jobFileName, e);
    }
  }

  /**
   * Create and start a cluster manager.
   * @param id - array offset
   * @throws Exception
   */
  private void setupManager(int id) throws Exception {
    _clusterManagers[id] =
        new GobblinClusterManager(TestHelper.TEST_APPLICATION_NAME, TestHelper.TEST_APPLICATION_ID,
            _config.withValue(GobblinClusterConfigurationKeys.HELIX_INSTANCE_NAME_KEY,
                ConfigValueFactory.fromAnyRef("Manager_" + id)),
            Optional.of(new Path(_config.getString("gobblin.cluster.work.dir"))));

    _clusterManagers[id].start();
  }

  /**
   * Create and start a cluster worker.
   * @param id - array offset
   * @throws Exception
   */
  private void setupWorker(int id) throws Exception {
    final GobblinTaskRunner fworker =
        new GobblinTaskRunner(TestHelper.TEST_APPLICATION_NAME, "Worker_" + id, TestHelper.TEST_APPLICATION_ID, "1",
            _config, Optional.of(new Path(_config.getString("gobblin.cluster.work.dir"))));

    _clusterWorkers[id] = fworker;

    // start the worker on its own thread; tearDown() joins these threads after the workers are stopped
    _workerStartThreads[id] = new Thread(new Runnable() {
      @Override
      public void run() {
        fworker.start();
      }
    });
    _workerStartThreads[id].start();
  }

  @BeforeClass
  public void setUp() throws Exception {
    // Use a random ZK port
    _testingZKServer = new TestingServer(-1);
    LOG.info("Testing ZK Server listening on: " + _testingZKServer.getConnectString());

    URL url = GobblinClusterKillTest.class.getClassLoader().getResource(
        GobblinClusterKillTest.class.getSimpleName() + ".conf");
    Assert.assertNotNull(url, "Could not find resource " + GobblinClusterKillTest.class.getSimpleName() + ".conf");

    _config = ConfigFactory.parseURL(url)
        .withValue("gobblin.cluster.zk.connection.string",
            ConfigValueFactory.fromAnyRef(_testingZKServer.getConnectString()))
        .withValue("gobblin.cluster.jobconf.fullyQualifiedPath",
            ConfigValueFactory.fromAnyRef("/tmp/gobblinClusterKillTest/job-conf"))
        .resolve();

    String zkConnectionString = _config.getString(GobblinClusterConfigurationKeys.ZK_CONNECTION_STRING_KEY);
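
    // The Helix cluster must exist in ZooKeeper before the managers and workers created below can join it,
    // so it is created here first.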
    HelixUtils.createGobblinHelixCluster(zkConnectionString,
        _config.getString(GobblinClusterConfigurationKeys.HELIX_CLUSTER_NAME_KEY));

    setupTestDir();

    _clusterManagers = new GobblinClusterManager[NUM_MANAGERS];
    _clusterWorkers = new GobblinTaskRunner[NUM_WORKERS];
    _workerStartThreads = new Thread[NUM_WORKERS];

    for (int i = 0; i < NUM_MANAGERS; i++) {
      setupManager(i);
    }

    for (int i = 0; i < NUM_WORKERS; i++) {
      setupWorker(i);
    }
  }

  // The kill tests are unreliable on Travis
  @Test(groups = { "disabledOnTravis" })
  public void testKillWorker() throws TimeoutException, InterruptedException {
    Collection<File> matches = Collections.EMPTY_LIST;
    final File writerOutputDir = new File(_testDirPath + "/writer-output/gobblin/util/test/HelloWorldSource/");
    final File jobOutputDir = new File(_testDirPath + "/job-output/gobblin/util/test/HelloWorldSource/");
    final File testJobFile = new File(_jobDirPath + "/GobblinClusterKillTestJob.conf");

    // Job file should exist
    Assert.assertTrue(testJobFile.exists());

    AssertWithBackoff.create().logger(LOG).timeoutMs(ASSERT_TIMEOUT).maxSleepMs(ASSERT_MAX_SLEEP).backoffFactor(1.5)
        .assertTrue(new Predicate<Void>() {
          @Override
          public boolean apply(Void input) {
            if (writerOutputDir.exists()) {
              return FileUtils.listFiles(writerOutputDir, new String[]{"txt"}, true).size() >= 25;
            } else {
              return false;
            }
          }
        }, "Waiting for writer output");

    LOG.info("{} matches found before disconnecting worker",
        FileUtils.listFiles(writerOutputDir, new String[]{"txt"}, true).size());

    _clusterWorkers[0].disconnectHelixManager();

    AssertWithBackoff.create().logger(LOG).timeoutMs(ASSERT_TIMEOUT).maxSleepMs(ASSERT_MAX_SLEEP).backoffFactor(1.5)
        .assertTrue(new Predicate<Void>() {
          @Override
          public boolean apply(Void input) {
            if (jobOutputDir.exists()) {
              return FileUtils.listFiles(jobOutputDir, new String[]{"txt"}, true).size() >= 100;
            } else {
              return false;
            }
          }
        }, "Waiting for job-completion");

    // Job file should have been deleted
    Thread.sleep(5000);
    Assert.assertFalse(testJobFile.exists());
  }

  // The kill tests are unreliable on Travis
  @Test(groups = { "disabledOnTravis" }, dependsOnMethods = "testKillWorker")
  public void testKillManager() throws IOException, TimeoutException, InterruptedException {
    Collection<File> matches = Collections.EMPTY_LIST;
    final File writerOutputDir = new File(_testDirPath + "/writer-output/gobblin/util/test/HelloWorldSource/");
    final File jobOutputDir = new File(_testDirPath + "/job-output/gobblin/util/test/HelloWorldSource/");

    // reinitialize test directory
    setupTestDir();

    // kill a manager to cause leader election. New leader will schedule a new job.
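    // Disconnecting the Helix manager simulates the manager process being killed without stopping the test JVM;
    // Helix then elects one of the remaining managers as the new leader.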
    _clusterManagers[0].disconnectHelixManager();

    AssertWithBackoff.create().logger(LOG).timeoutMs(ASSERT_TIMEOUT).maxSleepMs(ASSERT_MAX_SLEEP).backoffFactor(1.5)
        .assertTrue(new Predicate<Void>() {
          @Override
          public boolean apply(Void input) {
            if (writerOutputDir.exists()) {
              return FileUtils.listFiles(writerOutputDir, new String[]{"txt"}, true).size() >= 25;
            } else {
              return false;
            }
          }
        }, "Waiting for writer output");

    AssertWithBackoff.create().logger(LOG).timeoutMs(ASSERT_TIMEOUT).maxSleepMs(ASSERT_MAX_SLEEP).backoffFactor(1.5)
        .assertTrue(new Predicate<Void>() {
          @Override
          public boolean apply(Void input) {
            if (jobOutputDir.exists()) {
              return FileUtils.listFiles(jobOutputDir, new String[]{"txt"}, true).size() >= 100;
            } else {
              return false;
            }
          }
        }, "Waiting for job-completion");

    // Job file should have been deleted
    Thread.sleep(5000);
    final File testJobFile = new File(_jobDirPath + "/GobblinClusterKillTestJob.conf");
    Assert.assertFalse(testJobFile.exists());
  }

  // The kill tests are unreliable on Travis
  @Test(groups = { "disabledOnTravis" }, enabled = true, dependsOnMethods = "testKillManager")
  public void testRestartManager() throws IOException, TimeoutException, InterruptedException {
    Collection<File> matches = Collections.EMPTY_LIST;
    final File writerOutputDir = new File(_testDirPath + "/writer-output/gobblin/util/test/HelloWorldSource/");
    final File jobOutputDir = new File(_testDirPath + "/job-output/gobblin/util/test/HelloWorldSource/");

    // reinitialize test directory
    setupTestDir();

    // At this point there is one connected manager. Disconnect it and reconnect the other one to confirm that a
    // manager can continue to function after regaining leadership.
    _clusterManagers[1].disconnectHelixManager();

    // Should function after regaining leadership.
    // Need to reinitialize the Helix manager and call handleLeadershipChange to shut down services in the test,
    // since the leadership change is simulated.
    _clusterManagers[0].initializeHelixManager();
    _clusterManagers[0].handleLeadershipChange(null);

    // reconnect to get leadership role
    _clusterManagers[0].connectHelixManager();

    AssertWithBackoff.create().logger(LOG).timeoutMs(ASSERT_TIMEOUT).maxSleepMs(ASSERT_MAX_SLEEP).backoffFactor(1.5)
        .assertTrue(new Predicate<Void>() {
          @Override
          public boolean apply(Void input) {
            if (writerOutputDir.exists()) {
              return FileUtils.listFiles(writerOutputDir, new String[]{"txt"}, true).size() >= 25;
            } else {
              return false;
            }
          }
        }, "Waiting for writer output");

    AssertWithBackoff.create().logger(LOG).timeoutMs(ASSERT_TIMEOUT).maxSleepMs(ASSERT_MAX_SLEEP).backoffFactor(1.5)
        .assertTrue(new Predicate<Void>() {
          @Override
          public boolean apply(Void input) {
            if (jobOutputDir.exists()) {
              return FileUtils.listFiles(jobOutputDir, new String[]{"txt"}, true).size() >= 100;
            } else {
              return false;
            }
          }
        }, "Waiting for job-completion");
  }

  @AfterClass
  public void tearDown() throws IOException, InterruptedException {
    // Reconnect any manager that was left disconnected by the tests so that it can be stopped cleanly.
    for (int i = 0; i < NUM_MANAGERS; i++) {
      if (!_clusterManagers[i].isHelixManagerConnected()) {
        _clusterManagers[i].connectHelixManager();
      }
      _clusterManagers[i].stop();
    }

    for (int i = 0; i < NUM_WORKERS; i++) {
      _clusterWorkers[i].stop();
    }

    for (int i = 0; i < NUM_WORKERS; i++) {
      _workerStartThreads[i].join();
    }

    _testingZKServer.close();
  }
}