/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.test.recovery;
import akka.actor.ActorRef;
import akka.actor.ActorSystem;
import akka.pattern.Patterns;
import akka.util.Timeout;
import org.apache.commons.io.FileUtils;
import org.apache.flink.configuration.ConfigConstants;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.configuration.JobManagerOptions;
import org.apache.flink.configuration.TaskManagerOptions;
import org.apache.flink.runtime.akka.AkkaUtils;
import org.apache.flink.runtime.clusterframework.types.ResourceID;
import org.apache.flink.runtime.highavailability.HighAvailabilityServices;
import org.apache.flink.runtime.highavailability.HighAvailabilityServicesUtils;
import org.apache.flink.runtime.jobmanager.JobManager;
import org.apache.flink.runtime.jobmanager.MemoryArchivist;
import org.apache.flink.runtime.messages.JobManagerMessages;
import org.apache.flink.runtime.taskmanager.TaskManager;
import org.apache.flink.runtime.testingUtils.TestingUtils;
import org.apache.flink.runtime.testutils.CommonTestUtils;
import org.apache.flink.util.NetUtils;
import org.apache.flink.util.TestLogger;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Some;
import scala.Tuple2;
import scala.concurrent.Await;
import scala.concurrent.Future;
import scala.concurrent.duration.FiniteDuration;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.StringWriter;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicReference;
import static org.apache.flink.runtime.testutils.CommonTestUtils.getCurrentClasspath;
import static org.apache.flink.runtime.testutils.CommonTestUtils.getJavaCommandPath;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.fail;
/**
* Abstract base for tests verifying the behavior of the recovery in the
* case when a TaskManager fails (process is killed) in the middle of a job execution.
*
* The test works with multiple task managers processes by spawning JVMs.
* Initially, it starts a JobManager in process and two TaskManagers JVMs with
* 2 task slots each.
* It submits a program with parallelism 4 and waits until all tasks are brought up.
* Coordination between the test and the tasks happens via checking for the
* existence of temporary files. It then starts another TaskManager, which is
* guaranteed to remain empty (all tasks are already deployed) and kills one of
* the original task managers. The recovery should restart the tasks on the new TaskManager.
*/
public abstract class AbstractTaskManagerProcessFailureRecoveryTest extends TestLogger {
protected final Logger LOG = LoggerFactory.getLogger(getClass());
protected static final String READY_MARKER_FILE_PREFIX = "ready_";
protected static final String PROCEED_MARKER_FILE = "proceed";
protected static final String FINISH_MARKER_FILE_PREFIX = "finish_";
protected static final int PARALLELISM = 4;
@Test
public void testTaskManagerProcessFailure() throws Exception {
final StringWriter processOutput1 = new StringWriter();
final StringWriter processOutput2 = new StringWriter();
final StringWriter processOutput3 = new StringWriter();
ActorSystem jmActorSystem = null;
HighAvailabilityServices highAvailabilityServices = null;
Process taskManagerProcess1 = null;
Process taskManagerProcess2 = null;
Process taskManagerProcess3 = null;
File coordinateTempDir = null;
try {
// check that we run this test only if the java command
// is available on this machine
String javaCommand = getJavaCommandPath();
if (javaCommand == null) {
System.out.println("---- Skipping Process Failure test : Could not find java executable ----");
return;
}
// create a logging file for the process
File tempLogFile = File.createTempFile(getClass().getSimpleName() + "-", "-log4j.properties");
tempLogFile.deleteOnExit();
CommonTestUtils.printLog4jDebugConfig(tempLogFile);
// coordination between the processes goes through a directory
coordinateTempDir = CommonTestUtils.createTempDirectory();
// find a free port to start the JobManager
final int jobManagerPort = NetUtils.getAvailablePort();
// start a JobManager
Tuple2<String, Object> localAddress = new Tuple2<String, Object>("localhost", jobManagerPort);
Configuration jmConfig = new Configuration();
jmConfig.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_INTERVAL, "1000 ms");
jmConfig.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_PAUSE, "6 s");
jmConfig.setInteger(ConfigConstants.AKKA_WATCH_THRESHOLD, 9);
jmConfig.setString(ConfigConstants.RESTART_STRATEGY_FIXED_DELAY_DELAY, "10 s");
jmConfig.setString(ConfigConstants.AKKA_ASK_TIMEOUT, "100 s");
jmConfig.setString(JobManagerOptions.ADDRESS, localAddress._1());
jmConfig.setInteger(JobManagerOptions.PORT, jobManagerPort);
highAvailabilityServices = HighAvailabilityServicesUtils.createHighAvailabilityServices(
jmConfig,
TestingUtils.defaultExecutor(),
HighAvailabilityServicesUtils.AddressResolution.NO_ADDRESS_RESOLUTION);
jmActorSystem = AkkaUtils.createActorSystem(jmConfig, new Some<>(localAddress));
ActorRef jmActor = JobManager.startJobManagerActors(
jmConfig,
jmActorSystem,
TestingUtils.defaultExecutor(),
TestingUtils.defaultExecutor(),
highAvailabilityServices,
JobManager.class,
MemoryArchivist.class)._1();
// the TaskManager java command
String[] command = new String[] {
javaCommand,
"-Dlog.level=DEBUG",
"-Dlog4j.configuration=file:" + tempLogFile.getAbsolutePath(),
"-Xms80m", "-Xmx80m",
"-classpath", getCurrentClasspath(),
TaskManagerProcessEntryPoint.class.getName(),
String.valueOf(jobManagerPort)
};
// start the first two TaskManager processes
taskManagerProcess1 = new ProcessBuilder(command).start();
new CommonTestUtils.PipeForwarder(taskManagerProcess1.getErrorStream(), processOutput1);
taskManagerProcess2 = new ProcessBuilder(command).start();
new CommonTestUtils.PipeForwarder(taskManagerProcess2.getErrorStream(), processOutput2);
// we wait for the JobManager to have the two TaskManagers available
// since some of the CI environments are very hostile, we need to give this a lot of time (2 minutes)
waitUntilNumTaskManagersAreRegistered(jmActor, 2, 120000);
// the program will set a marker file in each of its parallel tasks once they are ready, so that
// this coordinating code is aware of this.
// the program will very slowly consume elements until the marker file (later created by the
// test driver code) is present
final File coordinateDirClosure = coordinateTempDir;
final AtomicReference<Throwable> errorRef = new AtomicReference<>();
// we trigger program execution in a separate thread
Thread programTrigger = new Thread("Program Trigger") {
@Override
public void run() {
try {
testTaskManagerFailure(jobManagerPort, coordinateDirClosure);
}
catch (Throwable t) {
t.printStackTrace();
errorRef.set(t);
}
}
};
//start the test program
programTrigger.start();
// wait until all marker files are in place, indicating that all tasks have started
// max 20 seconds
if (!waitForMarkerFiles(coordinateTempDir, READY_MARKER_FILE_PREFIX, PARALLELISM, 120000)) {
// check if the program failed for some reason
if (errorRef.get() != null) {
Throwable error = errorRef.get();
error.printStackTrace();
fail("The program encountered a " + error.getClass().getSimpleName() + " : " + error.getMessage());
}
else {
// no error occurred, simply a timeout
fail("The tasks were not started within time (" + 120000 + "msecs)");
}
}
// start the third TaskManager
taskManagerProcess3 = new ProcessBuilder(command).start();
new CommonTestUtils.PipeForwarder(taskManagerProcess3.getErrorStream(), processOutput3);
// we wait for the third TaskManager to register
// since some of the CI environments are very hostile, we need to give this a lot of time (2 minutes)
waitUntilNumTaskManagersAreRegistered(jmActor, 3, 120000);
// kill one of the previous TaskManagers, triggering a failure and recovery
taskManagerProcess1.destroy();
taskManagerProcess1 = null;
// we create the marker file which signals the program functions tasks that they can complete
touchFile(new File(coordinateTempDir, PROCEED_MARKER_FILE));
// wait for at most 5 minutes for the program to complete
programTrigger.join(300000);
// check that the program really finished
assertFalse("The program did not finish in time", programTrigger.isAlive());
// check whether the program encountered an error
if (errorRef.get() != null) {
Throwable error = errorRef.get();
error.printStackTrace();
fail("The program encountered a " + error.getClass().getSimpleName() + " : " + error.getMessage());
}
// all seems well :-)
}
catch (Exception e) {
e.printStackTrace();
printProcessLog("TaskManager 1", processOutput1.toString());
printProcessLog("TaskManager 2", processOutput2.toString());
printProcessLog("TaskManager 3", processOutput3.toString());
fail(e.getMessage());
}
catch (Error e) {
e.printStackTrace();
printProcessLog("TaskManager 1", processOutput1.toString());
printProcessLog("TaskManager 2", processOutput2.toString());
printProcessLog("TaskManager 3", processOutput3.toString());
throw e;
}
finally {
if (taskManagerProcess1 != null) {
taskManagerProcess1.destroy();
}
if (taskManagerProcess2 != null) {
taskManagerProcess2.destroy();
}
if (taskManagerProcess3 != null) {
taskManagerProcess3.destroy();
}
if (jmActorSystem != null) {
jmActorSystem.shutdown();
}
if (coordinateTempDir != null) {
try {
FileUtils.deleteDirectory(coordinateTempDir);
}
catch (Throwable t) {
// we can ignore this
}
}
if (highAvailabilityServices != null) {
highAvailabilityServices.closeAndCleanupAllData();
}
}
}
/**
* The test program should be implemented here in a form of a separate thread.
* This provides a solution for checking that it has been terminated.
*
* @param jobManagerPort The port for submitting the topology to the local cluster
* @param coordinateDir TaskManager failure will be triggered only after processes
* have successfully created file under this directory
*/
public abstract void testTaskManagerFailure(int jobManagerPort, File coordinateDir) throws Exception;
protected void waitUntilNumTaskManagersAreRegistered(ActorRef jobManager, int numExpected, long maxDelayMillis)
throws Exception
{
final long pollInterval = 10_000_000; // 10 ms = 10,000,000 nanos
final long deadline = System.nanoTime() + maxDelayMillis * 1_000_000;
long time;
while ((time = System.nanoTime()) < deadline) {
FiniteDuration timeout = new FiniteDuration(pollInterval, TimeUnit.NANOSECONDS);
try {
Future<?> result = Patterns.ask(jobManager,
JobManagerMessages.getRequestNumberRegisteredTaskManager(),
new Timeout(timeout));
int numTMs = (Integer) Await.result(result, timeout);
if (numTMs == numExpected) {
return;
}
}
catch (TimeoutException e) {
// ignore and retry
}
catch (ClassCastException e) {
fail("Wrong response: " + e.getMessage());
}
long timePassed = System.nanoTime() - time;
long remainingMillis = (pollInterval - timePassed) / 1_000_000;
if (remainingMillis > 0) {
Thread.sleep(remainingMillis);
}
}
fail("The TaskManagers did not register within the expected time (" + maxDelayMillis + "msecs)");
}
protected static void printProcessLog(String processName, String log) {
if (log == null || log.length() == 0) {
return;
}
System.out.println("-----------------------------------------");
System.out.println(" BEGIN SPAWNED PROCESS LOG FOR " + processName);
System.out.println("-----------------------------------------");
System.out.println(log);
System.out.println("-----------------------------------------");
System.out.println(" END SPAWNED PROCESS LOG");
System.out.println("-----------------------------------------");
}
protected static void touchFile(File file) throws IOException {
if (!file.exists()) {
new FileOutputStream(file).close();
}
if (!file.setLastModified(System.currentTimeMillis())) {
throw new IOException("Could not touch the file.");
}
}
protected static boolean waitForMarkerFiles(File basedir, String prefix, int num, long timeout) {
long now = System.currentTimeMillis();
final long deadline = now + timeout;
while (now < deadline) {
boolean allFound = true;
for (int i = 0; i < num; i++) {
File nextToCheck = new File(basedir, prefix + i);
if (!nextToCheck.exists()) {
allFound = false;
break;
}
}
if (allFound) {
return true;
}
else {
// not all found, wait for a bit
try {
Thread.sleep(10);
}
catch (InterruptedException e) {
throw new RuntimeException(e);
}
now = System.currentTimeMillis();
}
}
return false;
}
// --------------------------------------------------------------------------------------------
/**
* The entry point for the TaskManager JVM. Simply configures and runs a TaskManager.
*/
public static class TaskManagerProcessEntryPoint {
private static final Logger LOG = LoggerFactory.getLogger(TaskManagerProcessEntryPoint.class);
public static void main(String[] args) {
try {
int jobManagerPort = Integer.parseInt(args[0]);
Configuration cfg = new Configuration();
cfg.setString(ConfigConstants.JOB_MANAGER_IPC_ADDRESS_KEY, "localhost");
cfg.setInteger(ConfigConstants.JOB_MANAGER_IPC_PORT_KEY, jobManagerPort);
cfg.setLong(TaskManagerOptions.MANAGED_MEMORY_SIZE, 4L);
cfg.setInteger(TaskManagerOptions.NETWORK_NUM_BUFFERS, 100);
cfg.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, 2);
cfg.setString(ConfigConstants.AKKA_ASK_TIMEOUT, "100 s");
TaskManager.selectNetworkInterfaceAndRunTaskManager(cfg,
ResourceID.generate(), TaskManager.class);
// wait forever
Object lock = new Object();
synchronized (lock) {
lock.wait();
}
}
catch (Throwable t) {
LOG.error("Failed to start TaskManager process", t);
System.exit(1);
}
}
}
}