/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.test.recovery;
import akka.actor.ActorRef;
import akka.actor.ActorSystem;
import akka.pattern.Patterns;
import akka.util.Timeout;
import org.apache.flink.api.common.JobID;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.restartstrategy.RestartStrategies;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.io.DiscardingOutputFormat;
import org.apache.flink.client.program.ProgramInvocationException;
import org.apache.flink.configuration.ConfigConstants;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.configuration.JobManagerOptions;
import org.apache.flink.runtime.akka.AkkaUtils;
import org.apache.flink.runtime.client.JobStatusMessage;
import org.apache.flink.runtime.highavailability.HighAvailabilityServices;
import org.apache.flink.runtime.highavailability.HighAvailabilityServicesUtils;
import org.apache.flink.runtime.jobmanager.JobManager;
import org.apache.flink.runtime.jobmanager.MemoryArchivist;
import org.apache.flink.runtime.messages.JobManagerMessages;
import org.apache.flink.runtime.testingUtils.TestingUtils;
import org.apache.flink.runtime.testutils.CommonTestUtils;
import org.apache.flink.util.NetUtils;
import org.apache.flink.util.TestLogger;
import org.junit.Test;
import scala.Some;
import scala.Tuple2;
import scala.concurrent.Await;
import scala.concurrent.Future;
import scala.concurrent.duration.FiniteDuration;
import java.io.File;
import java.io.StringWriter;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import static org.apache.flink.runtime.testutils.CommonTestUtils.getCurrentClasspath;
import static org.apache.flink.runtime.testutils.CommonTestUtils.getJavaCommandPath;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
/**
* This test makes sure that jobs are canceled properly in cases where
* the task manager went down and did not respond to cancel messages.
*/
@SuppressWarnings("serial")
public class ProcessFailureCancelingITCase extends TestLogger {
@Test
public void testCancelingOnProcessFailure() throws Exception {
final StringWriter processOutput = new StringWriter();
ActorSystem jmActorSystem = null;
Process taskManagerProcess = null;
HighAvailabilityServices highAvailabilityServices = null;
try {
// check that we run this test only if the java command
// is available on this machine
String javaCommand = getJavaCommandPath();
if (javaCommand == null) {
System.out.println("---- Skipping Process Failure test : Could not find java executable ----");
return;
}
// create a logging file for the process
File tempLogFile = File.createTempFile(getClass().getSimpleName() + "-", "-log4j.properties");
tempLogFile.deleteOnExit();
CommonTestUtils.printLog4jDebugConfig(tempLogFile);
// find a free port to start the JobManager
final int jobManagerPort = NetUtils.getAvailablePort();
// start a JobManager
Tuple2<String, Object> localAddress = new Tuple2<String, Object>("localhost", jobManagerPort);
Configuration jmConfig = new Configuration();
jmConfig.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_INTERVAL, "5 s");
jmConfig.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_PAUSE, "2000 s");
jmConfig.setInteger(ConfigConstants.AKKA_WATCH_THRESHOLD, 10);
jmConfig.setString(ConfigConstants.AKKA_ASK_TIMEOUT, "100 s");
jmConfig.setString(JobManagerOptions.ADDRESS, localAddress._1());
jmConfig.setInteger(JobManagerOptions.PORT, jobManagerPort);
highAvailabilityServices = HighAvailabilityServicesUtils.createHighAvailabilityServices(
jmConfig,
TestingUtils.defaultExecutor(),
HighAvailabilityServicesUtils.AddressResolution.NO_ADDRESS_RESOLUTION);
jmActorSystem = AkkaUtils.createActorSystem(jmConfig, new Some<>(localAddress));
ActorRef jmActor = JobManager.startJobManagerActors(
jmConfig,
jmActorSystem,
TestingUtils.defaultExecutor(),
TestingUtils.defaultExecutor(),
highAvailabilityServices,
JobManager.class,
MemoryArchivist.class)._1();
// the TaskManager java command
String[] command = new String[] {
javaCommand,
"-Dlog.level=DEBUG",
"-Dlog4j.configuration=file:" + tempLogFile.getAbsolutePath(),
"-Xms80m", "-Xmx80m",
"-classpath", getCurrentClasspath(),
AbstractTaskManagerProcessFailureRecoveryTest.TaskManagerProcessEntryPoint.class.getName(),
String.valueOf(jobManagerPort)
};
// start the first two TaskManager processes
taskManagerProcess = new ProcessBuilder(command).start();
new CommonTestUtils.PipeForwarder(taskManagerProcess.getErrorStream(), processOutput);
// we wait for the JobManager to have the two TaskManagers available
// since some of the CI environments are very hostile, we need to give this a lot of time (2 minutes)
waitUntilNumTaskManagersAreRegistered(jmActor, 1, 120000);
final Throwable[] errorRef = new Throwable[1];
// start the test program, which infinitely blocks
Runnable programRunner = new Runnable() {
@Override
public void run() {
try {
ExecutionEnvironment env = ExecutionEnvironment.createRemoteEnvironment("localhost", jobManagerPort);
env.setParallelism(2);
env.setRestartStrategy(RestartStrategies.noRestart());
env.getConfig().disableSysoutLogging();
env.generateSequence(0, Long.MAX_VALUE)
.map(new MapFunction<Long, Long>() {
@Override
public Long map(Long value) throws Exception {
synchronized (this) {
wait();
}
return 0L;
}
})
.output(new DiscardingOutputFormat<Long>());
env.execute();
}
catch (Throwable t) {
errorRef[0] = t;
}
}
};
Thread programThread = new Thread(programRunner);
// kill the TaskManager
taskManagerProcess.destroy();
taskManagerProcess = null;
// immediately submit the job. this should hit the case
// where the JobManager still thinks it has the TaskManager and tries to send it tasks
programThread.start();
// try to cancel the job
cancelRunningJob(jmActor);
// we should see a failure within reasonable time (10s is the ask timeout).
// since the CI environment is often slow, we conservatively give it up to 2 minutes,
// to fail, which is much lower than the failure time given by the heartbeats ( > 2000s)
programThread.join(120000);
assertFalse("The program did not cancel in time (2 minutes)", programThread.isAlive());
Throwable error = errorRef[0];
assertNotNull("The program did not fail properly", error);
assertTrue(error instanceof ProgramInvocationException);
// all seems well :-)
}
catch (Exception e) {
printProcessLog("TaskManager", processOutput.toString());
throw e;
}
catch (Error e) {
printProcessLog("TaskManager 1", processOutput.toString());
throw e;
}
finally {
if (taskManagerProcess != null) {
taskManagerProcess.destroy();
}
if (jmActorSystem != null) {
jmActorSystem.shutdown();
}
if (highAvailabilityServices != null) {
highAvailabilityServices.closeAndCleanupAllData();
}
}
}
private void cancelRunningJob(ActorRef jobManager) throws Exception {
final FiniteDuration askTimeout = new FiniteDuration(10, TimeUnit.SECONDS);
// try at most for 30 seconds
final long deadline = System.currentTimeMillis() + 30000;
JobID jobId = null;
do {
Future<Object> response = Patterns.ask(jobManager,
JobManagerMessages.getRequestRunningJobsStatus(), new Timeout(askTimeout));
Object result;
try {
result = Await.result(response, askTimeout);
}
catch (Exception e) {
throw new Exception("Could not retrieve running jobs from the JobManager.", e);
}
if (result instanceof JobManagerMessages.RunningJobsStatus) {
List<JobStatusMessage> jobs = ((JobManagerMessages.RunningJobsStatus) result).getStatusMessages();
if (jobs.size() == 1) {
jobId = jobs.get(0).getJobId();
break;
}
}
}
while (System.currentTimeMillis() < deadline);
if (jobId == null) {
// we never found it running, must have failed already
return;
}
// tell the JobManager to cancel the job
jobManager.tell(
new JobManagerMessages.LeaderSessionMessage(
HighAvailabilityServices.DEFAULT_LEADER_ID,
new JobManagerMessages.CancelJob(jobId)),
ActorRef.noSender());
}
private void waitUntilNumTaskManagersAreRegistered(ActorRef jobManager, int numExpected, long maxDelay)
throws Exception
{
final long deadline = System.currentTimeMillis() + maxDelay;
while (true) {
long remaining = deadline - System.currentTimeMillis();
if (remaining <= 0) {
fail("The TaskManagers did not register within the expected time (" + maxDelay + "msecs)");
}
FiniteDuration timeout = new FiniteDuration(remaining, TimeUnit.MILLISECONDS);
try {
Future<?> result = Patterns.ask(jobManager,
JobManagerMessages.getRequestNumberRegisteredTaskManager(),
new Timeout(timeout));
Integer numTMs = (Integer) Await.result(result, timeout);
if (numTMs == numExpected) {
break;
}
}
catch (TimeoutException e) {
// ignore and retry
}
catch (ClassCastException e) {
fail("Wrong response: " + e.getMessage());
}
}
}
private void printProcessLog(String processName, String log) {
if (log == null || log.length() == 0) {
return;
}
System.out.println("-----------------------------------------");
System.out.println(" BEGIN SPAWNED PROCESS LOG FOR " + processName);
System.out.println("-----------------------------------------");
System.out.println(log);
System.out.println("-----------------------------------------");
System.out.println(" END SPAWNED PROCESS LOG");
System.out.println("-----------------------------------------");
}
}