/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.test.recovery;
import akka.actor.ActorRef;
import akka.actor.ActorSystem;
import akka.actor.Props;
import akka.actor.UntypedActor;
import akka.testkit.TestActorRef;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.filefilter.TrueFileFilter;
import org.apache.flink.api.common.ExecutionConfig;
import org.apache.flink.api.common.restartstrategy.RestartStrategies;
import org.apache.flink.configuration.ConfigConstants;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.runtime.akka.AkkaUtils;
import org.apache.flink.runtime.akka.ListeningBehaviour;
import org.apache.flink.runtime.clusterframework.types.ResourceID;
import org.apache.flink.runtime.highavailability.HighAvailabilityServices;
import org.apache.flink.runtime.highavailability.HighAvailabilityServicesUtils;
import org.apache.flink.runtime.instance.ActorGateway;
import org.apache.flink.runtime.instance.AkkaActorGateway;
import org.apache.flink.runtime.jobgraph.JobGraph;
import org.apache.flink.runtime.jobgraph.JobStatus;
import org.apache.flink.runtime.jobgraph.JobVertex;
import org.apache.flink.runtime.jobmanager.SubmittedJobGraph;
import org.apache.flink.runtime.leaderelection.TestingListener;
import org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService;
import org.apache.flink.runtime.messages.JobManagerMessages;
import org.apache.flink.runtime.messages.JobManagerMessages.LeaderSessionMessage;
import org.apache.flink.runtime.messages.JobManagerMessages.SubmitJob;
import org.apache.flink.runtime.taskmanager.TaskManager;
import org.apache.flink.runtime.testingUtils.TestingCluster;
import org.apache.flink.runtime.testingUtils.TestingUtils;
import org.apache.flink.runtime.testtasks.BlockingNoOpInvokable;
import org.apache.flink.runtime.testutils.CommonTestUtils;
import org.apache.flink.runtime.testutils.JobManagerActorTestUtils;
import org.apache.flink.runtime.testutils.JobManagerProcess;
import org.apache.flink.runtime.testutils.ZooKeeperTestUtils;
import org.apache.flink.runtime.zookeeper.ZooKeeperTestEnvironment;
import org.apache.flink.util.TestLogger;
import org.apache.zookeeper.data.Stat;
import org.junit.AfterClass;
import org.junit.Before;
import org.junit.Test;
import scala.Option;
import scala.Some;
import scala.Tuple2;
import scala.concurrent.Await;
import scala.concurrent.Future;
import scala.concurrent.duration.Deadline;
import scala.concurrent.duration.FiniteDuration;
import java.io.File;
import java.io.IOException;
import java.util.Collection;
import java.util.Queue;
import java.util.UUID;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.fail;
/**
* Tests recovery of {@link SubmittedJobGraph} instances.
*/
public class JobManagerHAJobGraphRecoveryITCase extends TestLogger {
private final static ZooKeeperTestEnvironment ZooKeeper = new ZooKeeperTestEnvironment(1);
private final static FiniteDuration TestTimeOut = new FiniteDuration(5, TimeUnit.MINUTES);
private static final File FileStateBackendBasePath;
static {
try {
FileStateBackendBasePath = CommonTestUtils.createTempDirectory();
}
catch (IOException e) {
throw new RuntimeException("Error in test setup. Could not create directory.", e);
}
}
/**
 * Shuts down the ZooKeeper test environment and deletes the temporary base directory after
 * all tests of this class have run.
 *
 * <p>The directory is deleted in a {@code finally} block so that it does not stay behind on
 * disk when shutting down ZooKeeper throws.
 *
 * @throws Exception if the ZooKeeper shutdown or the directory deletion fails
 */
@AfterClass
public static void tearDown() throws Exception {
    try {
        ZooKeeper.shutdown();
    } finally {
        if (FileStateBackendBasePath != null) {
            FileUtils.deleteDirectory(FileStateBackendBasePath);
        }
    }
}
/**
 * Resets the shared state before each test: empties the temporary base directory (keeping the
 * directory itself) and then calls {@code deleteAll()} on the ZooKeeper test environment.
 *
 * @throws Exception if cleaning the directory or the ZooKeeper call fails
 */
@Before
public void cleanUp() throws Exception {
    final File stateDirectory = FileStateBackendBasePath;
    if (stateDirectory != null) {
        FileUtils.cleanDirectory(stateDirectory);
    }

    ZooKeeper.deleteAll();
}
// ---------------------------------------------------------------------------------------------
/**
 * Tests that the HA job is not cleaned up when the jobmanager is stopped.
 *
 * <p>A blocking job is submitted in detached mode to a local cluster. Once the job is reported
 * as {@link JobStatus#RUNNING} the cluster is shut down, and afterwards
 * {@link #verifyRecoveryState(Configuration)} checks that the recovery data is still present.
 */
@Test
public void testJobPersistencyWhenJobManagerShutdown() throws Exception {
    final Configuration haConfig = ZooKeeperTestUtils.createZooKeeperHAConfig(
        ZooKeeper.getConnectString(), FileStateBackendBasePath.getPath());

    // A single JobManager and a single TaskManager
    haConfig.setInteger(ConfigConstants.LOCAL_NUMBER_JOB_MANAGER, 1);
    haConfig.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, 1);

    final TestingCluster cluster = new TestingCluster(haConfig, false, false);

    try {
        final Deadline testDeadline = TestTimeOut.fromNow();

        // Bring up the JobManager and the TaskManager
        cluster.start(true);

        final JobGraph blockingJob = createBlockingJobGraph();

        // Guard against shut down races with a restart strategy: should the TM fail
        // before the JM, the job could otherwise be failed, which leads to state removal.
        final ExecutionConfig executionConfig = new ExecutionConfig();
        executionConfig.setRestartStrategy(
            RestartStrategies.fixedDelayRestart(Integer.MAX_VALUE, 100));
        blockingJob.setExecutionConfig(executionConfig);

        final ActorGateway leaderGateway = cluster.getLeaderGateway(testDeadline.timeLeft());

        // Detached submission: nobody listens for the result
        final SubmitJob submission = new SubmitJob(blockingJob, ListeningBehaviour.DETACHED);
        leaderGateway.tell(submission);

        // Block until the job is running
        JobManagerActorTestUtils.waitForJobStatus(
            blockingJob.getJobID(), JobStatus.RUNNING, leaderGateway, testDeadline.timeLeft());
    } finally {
        cluster.shutdown();
    }

    // The persisted job data must still be there after the JM has been shut down
    verifyRecoveryState(haConfig);
}
/**
* Tests that clients receive updates after recovery by a new leader.
*
* <p>Two JobManagers are started as separate processes together with one TaskManager. A blocking
* job is submitted to the current leader with
* {@link ListeningBehaviour#EXECUTION_RESULT_AND_STATE_CHANGES}, using a recording client actor
* as sender. The leading JobManager process is then destroyed; the test waits for the next
* leader to report the job as RUNNING, cancels the job, waits for the client to receive a job
* result and finally counts the {@link JobManagerMessages.JobSubmitSuccess} messages recorded
* by the client.
*/
@Test
public void testClientNonDetachedListeningBehaviour() throws Exception {
Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(
ZooKeeper.getConnectString(), FileStateBackendBasePath.getPath());
// Test actor system
ActorSystem testSystem = null;
// JobManager setup. Start the job managers as separate processes in order to not run the
// actors postStop, which cleans up all running jobs.
JobManagerProcess[] jobManagerProcess = new JobManagerProcess[2];
LeaderRetrievalService leaderRetrievalService = null;
ActorSystem taskManagerSystem = null;
// Created outside of the try block; closed and cleaned up at the end of the finally block.
final HighAvailabilityServices highAvailabilityServices = HighAvailabilityServicesUtils.createHighAvailabilityServices(
config,
TestingUtils.defaultExecutor(),
HighAvailabilityServicesUtils.AddressResolution.NO_ADDRESS_RESOLUTION);
try {
// All waits below are bounded by the time left until this deadline.
final Deadline deadline = TestTimeOut.fromNow();
// Test actor system
// NOTE(review): the tuple is presumably (hostname, port), with port 0 meaning "any free
// port" -- confirm against AkkaUtils.createActorSystem.
testSystem = AkkaUtils.createActorSystem(new Configuration(),
new Some<>(new Tuple2<String, Object>("localhost", 0)));
// The job managers
jobManagerProcess[0] = new JobManagerProcess(0, config);
jobManagerProcess[1] = new JobManagerProcess(1, config);
jobManagerProcess[0].startProcess();
jobManagerProcess[1].startProcess();
// Leader listener
TestingListener leaderListener = new TestingListener();
leaderRetrievalService = highAvailabilityServices.getJobManagerLeaderRetriever(HighAvailabilityServices.DEFAULT_JOB_ID);
leaderRetrievalService.start(leaderListener);
// The task manager
taskManagerSystem = AkkaUtils.createActorSystem(AkkaUtils.getDefaultAkkaConfig());
TaskManager.startTaskManagerComponentsAndActor(
config,
ResourceID.generate(),
taskManagerSystem,
highAvailabilityServices,
"localhost",
Option.<String>empty(),
false,
TaskManager.class);
// Client test actor
TestActorRef<RecordingTestClient> clientRef = TestActorRef.create(
testSystem, Props.create(RecordingTestClient.class));
JobGraph jobGraph = createBlockingJobGraph();
{
// Initial submission
leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());
String leaderAddress = leaderListener.getAddress();
UUID leaderId = leaderListener.getLeaderSessionID();
// The client
AkkaActorGateway client = new AkkaActorGateway(clientRef, leaderId);
// Get the leader ref
ActorRef leaderRef = AkkaUtils.getActorRef(
leaderAddress, testSystem, deadline.timeLeft());
ActorGateway leader = new AkkaActorGateway(leaderRef, leaderId);
// Keep asking the leader until it reports a non-zero number of slots
// (presumably once the TaskManager has registered -- confirm).
int numSlots = 0;
while (numSlots == 0) {
Future<?> slotsFuture = leader.ask(JobManagerMessages
.getRequestTotalNumberOfSlots(), deadline.timeLeft());
numSlots = (Integer) Await.result(slotsFuture, deadline.timeLeft());
}
// Submit the job in non-detached mode
leader.tell(new SubmitJob(jobGraph,
ListeningBehaviour.EXECUTION_RESULT_AND_STATE_CHANGES), client);
JobManagerActorTestUtils.waitForJobStatus(
jobGraph.getJobID(), JobStatus.RUNNING, leader, deadline.timeLeft());
}
// Who's the boss?
// Process 0 is the leader if its Akka URL equals the address reported by the leader
// listener; otherwise process 1 is taken to be the leader.
JobManagerProcess leadingJobManagerProcess;
if (jobManagerProcess[0].getJobManagerAkkaURL(deadline.timeLeft()).equals(leaderListener.getAddress())) {
leadingJobManagerProcess = jobManagerProcess[0];
}
else {
leadingJobManagerProcess = jobManagerProcess[1];
}
// Kill the leading job manager process
leadingJobManagerProcess.destroy();
{
// Recovery by the standby JobManager
leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());
String leaderAddress = leaderListener.getAddress();
UUID leaderId = leaderListener.getLeaderSessionID();
ActorRef leaderRef = AkkaUtils.getActorRef(
leaderAddress, testSystem, deadline.timeLeft());
ActorGateway leader = new AkkaActorGateway(leaderRef, leaderId);
// The job must reach RUNNING again on the new leader before it is cancelled.
JobManagerActorTestUtils.waitForJobStatus(
jobGraph.getJobID(), JobStatus.RUNNING, leader, deadline.timeLeft());
// Cancel the job
leader.tell(new JobManagerMessages.CancelJob(jobGraph.getJobID()));
}
// Wait for the execution result
// (bounded by the time that is left until the deadline)
clientRef.underlyingActor().awaitJobResult(deadline.timeLeft().toMillis());
// Count the submission acknowledgements recorded by the client.
int jobSubmitSuccessMessages = 0;
for (Object msg : clientRef.underlyingActor().getMessages()) {
if (msg instanceof JobManagerMessages.JobSubmitSuccess) {
jobSubmitSuccessMessages++;
}
}
// At least two submissions should be ack-ed (initial and recovery). This is quite
// conservative, but it is still possible that these messages are overtaken by the
// final message.
// NOTE(review): the assertion below requires exactly two acknowledgements, while the
// comment above speaks of "at least two" -- confirm which of the two is intended.
assertEquals(2, jobSubmitSuccessMessages);
}
catch (Throwable t) {
// Print early (in some situations the process logs get too big
// for Travis and the root problem is not shown)
t.printStackTrace();
// In case of an error, print the job manager process logs.
if (jobManagerProcess[0] != null) {
jobManagerProcess[0].printProcessLog();
}
if (jobManagerProcess[1] != null) {
jobManagerProcess[1].printProcessLog();
}
// Rethrow so that the test still fails with the original error.
throw t;
}
finally {
// Release everything that was started; the null checks cover failures that happened
// before the respective component was created.
if (jobManagerProcess[0] != null) {
jobManagerProcess[0].destroy();
}
if (jobManagerProcess[1] != null) {
jobManagerProcess[1].destroy();
}
if (leaderRetrievalService != null) {
leaderRetrievalService.stop();
}
if (taskManagerSystem != null) {
taskManagerSystem.shutdown();
}
if (testSystem != null) {
testSystem.shutdown();
}
highAvailabilityServices.closeAndCleanupAllData();
}
}
/**
 * Simple recording client.
 *
 * <p>Stores every message it receives (with a {@link LeaderSessionMessage} wrapper removed) and
 * lets a test thread wait until a job result message has arrived.
 */
private static class RecordingTestClient extends UntypedActor {

    /** Received messages in arrival order; concurrent because the test thread reads it. */
    private final Queue<Object> messages = new ConcurrentLinkedQueue<>();

    /** Counted down when a job result (success or failure) is received. */
    private final CountDownLatch jobResultLatch = new CountDownLatch(1);

    /**
     * Records the message and releases the result latch if the message is a job result.
     *
     * @param message the received message, possibly wrapped in a {@link LeaderSessionMessage}
     */
    @Override
    public void onReceive(Object message) throws Exception {
        // Record the payload rather than the leader session envelope
        if (message instanceof LeaderSessionMessage) {
            message = ((LeaderSessionMessage) message).message();
        }

        messages.add(message);

        // Check for job result
        if (message instanceof JobManagerMessages.JobResultFailure ||
                message instanceof JobManagerMessages.JobResultSuccess) {
            jobResultLatch.countDown();
        }
    }

    /**
     * Returns the queue of recorded messages (the live queue, not a copy).
     */
    public Queue<Object> getMessages() {
        return messages;
    }

    /**
     * Waits until a job result message has been received.
     *
     * @param timeout maximum time to wait in milliseconds
     * @throws InterruptedException if the waiting thread is interrupted
     * @throws AssertionError if no job result has been received within the timeout
     */
    public void awaitJobResult(long timeout) throws InterruptedException {
        // await() returns false on timeout; report that instead of silently continuing
        if (!jobResultLatch.await(timeout, TimeUnit.MILLISECONDS)) {
            throw new AssertionError(
                "Did not receive a job result within " + timeout + " ms.");
        }
    }
}
// ---------------------------------------------------------------------------------------------
/**
 * Builds the job graph "Blocking program", which consists of the single vertex
 * "Blocking Vertex" with {@link BlockingNoOpInvokable} as invokable class.
 *
 * @return the newly created job graph
 */
private static JobGraph createBlockingJobGraph() {
    final JobVertex blockingVertex = new JobVertex("Blocking Vertex");
    blockingVertex.setInvokableClass(BlockingNoOpInvokable.class);

    final JobGraph graph = new JobGraph("Blocking program");
    graph.addVertex(blockingVertex);

    return graph;
}
/**
 * Fails the test if the recovery state (file state backend and ZooKeeper) is not clean.
 *
 * <p>The state counts as clean when the file state backend directory contains no files and the
 * ZooKeeper job graphs node exists, has seen child changes and currently has no children.
 *
 * @param config configuration from which the ZooKeeper job graphs path is read
 * @throws Exception if listing the files or accessing ZooKeeper fails
 */
private static void verifyCleanRecoveryState(Configuration config) throws Exception {
    // The file state backend directory must not contain any files
    Collection<File> stateHandles = FileUtils.listFiles(
        FileStateBackendBasePath, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE);
    if (!stateHandles.isEmpty()) {
        fail("File state backend is not clean: " + stateHandles);
    }

    // ZooKeeper
    String currentJobsPath = config.getString(
        ConfigConstants.HA_ZOOKEEPER_JOBGRAPHS_PATH,
        ConfigConstants.DEFAULT_ZOOKEEPER_JOBGRAPHS_PATH);
    Stat stat = ZooKeeper.getClient().checkExists().forPath(currentJobsPath);

    if (stat == null) {
        // checkExists() yields null for a missing node; fail with a clear message instead
        // of running into a NullPointerException below
        fail("ZooKeeper path '" + currentJobsPath + "' does not exist. " +
            "No job graph has been written during this test.");
    }

    if (stat.getCversion() == 0) {
        // Sanity check: verify that some changes have been performed
        fail("ZooKeeper state for '" + currentJobsPath + "' has not been modified during " +
            "this test. What are you testing?");
    }

    if (stat.getNumChildren() != 0) {
        // Is everything clean again?
        fail("ZooKeeper path '" + currentJobsPath + "' is not clean: " +
            ZooKeeper.getClient().getChildren().forPath(currentJobsPath));
    }
}
/**
 * Fails the test if the recovery state (file state backend and ZooKeeper) has been cleaned.
 *
 * <p>The state counts as present when the file state backend directory contains at least one
 * file and the ZooKeeper job graphs node exists, has seen child changes and still has children.
 *
 * @param config configuration from which the ZooKeeper job graphs path is read
 * @throws Exception if listing the files or accessing ZooKeeper fails
 */
private static void verifyRecoveryState(Configuration config) throws Exception {
    // The file state backend directory must still contain files
    Collection<File> stateHandles = FileUtils.listFiles(
        FileStateBackendBasePath, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE);
    if (stateHandles.isEmpty()) {
        fail("File state backend has been cleaned: " + stateHandles);
    }

    // ZooKeeper
    String currentJobsPath = config.getString(
        ConfigConstants.HA_ZOOKEEPER_JOBGRAPHS_PATH,
        ConfigConstants.DEFAULT_ZOOKEEPER_JOBGRAPHS_PATH);
    Stat stat = ZooKeeper.getClient().checkExists().forPath(currentJobsPath);

    if (stat == null) {
        // checkExists() yields null for a missing node; fail with a clear message instead
        // of running into a NullPointerException below
        fail("ZooKeeper path '" + currentJobsPath + "' does not exist. " +
            "The job graph state has been removed or was never written.");
    }

    if (stat.getCversion() == 0) {
        // Sanity check: verify that some changes have been performed
        fail("ZooKeeper state for '" + currentJobsPath + "' has not been modified during " +
            "this test. What are you testing?");
    }

    if (stat.getNumChildren() == 0) {
        // Children have been cleaned up?
        fail("ZooKeeper path '" + currentJobsPath + "' has been cleaned: " +
            ZooKeeper.getClient().getChildren().forPath(currentJobsPath));
    }
}
}