/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.runtime.executiongraph;
import static org.apache.flink.runtime.executiongraph.ExecutionGraphTestUtils.*;
import static org.junit.Assert.*;
import static org.mockito.Mockito.mock;
import java.io.IOException;
import org.apache.flink.runtime.akka.AkkaUtils;
import org.apache.flink.runtime.execution.ExecutionState;
import org.apache.flink.runtime.instance.BaseTestingActorGateway;
import org.apache.flink.runtime.instance.DummyActorGateway;
import org.apache.flink.runtime.instance.ActorGateway;
import org.apache.flink.runtime.instance.SimpleSlot;
import org.apache.flink.runtime.instance.Instance;
import org.apache.flink.api.common.JobID;
import org.apache.flink.runtime.jobgraph.JobStatus;
import org.apache.flink.runtime.jobgraph.JobVertexID;
import org.apache.flink.runtime.jobmanager.scheduler.Scheduler;
import org.apache.flink.runtime.jobmanager.slots.ActorTaskManagerGateway;
import org.apache.flink.runtime.messages.Acknowledge;
import org.apache.flink.runtime.messages.TaskMessages.SubmitTask;
import org.apache.flink.runtime.messages.TaskMessages.CancelTask;
import org.apache.flink.runtime.testingUtils.TestingUtils;
import org.apache.flink.runtime.testutils.DirectScheduledExecutorService;
import org.junit.Test;
import scala.concurrent.ExecutionContext;
@SuppressWarnings("serial")
public class ExecutionVertexCancelTest {
// --------------------------------------------------------------------------------------------
// Canceling in different states
// --------------------------------------------------------------------------------------------
@Test
public void testCancelFromCreated() {
try {
final JobVertexID jid = new JobVertexID();
final ExecutionJobVertex ejv = getExecutionVertex(jid);
final ExecutionVertex vertex = new ExecutionVertex(ejv, 0, new IntermediateResult[0],
AkkaUtils.getDefaultTimeout());
assertEquals(ExecutionState.CREATED, vertex.getExecutionState());
vertex.cancel();
assertEquals(ExecutionState.CANCELED, vertex.getExecutionState());
assertNull(vertex.getFailureCause());
assertTrue(vertex.getStateTimestamp(ExecutionState.CREATED) > 0);
assertTrue(vertex.getStateTimestamp(ExecutionState.CANCELING) > 0);
assertTrue(vertex.getStateTimestamp(ExecutionState.CANCELED) > 0);
}
catch (Exception e) {
e.printStackTrace();
fail(e.getMessage());
}
}
@Test
public void testCancelFromScheduled() {
try {
final JobVertexID jid = new JobVertexID();
final ExecutionJobVertex ejv = getExecutionVertex(jid);
final ExecutionVertex vertex = new ExecutionVertex(ejv, 0, new IntermediateResult[0],
AkkaUtils.getDefaultTimeout());
setVertexState(vertex, ExecutionState.SCHEDULED);
assertEquals(ExecutionState.SCHEDULED, vertex.getExecutionState());
vertex.cancel();
assertEquals(ExecutionState.CANCELED, vertex.getExecutionState());
assertNull(vertex.getFailureCause());
assertTrue(vertex.getStateTimestamp(ExecutionState.CREATED) > 0);
assertTrue(vertex.getStateTimestamp(ExecutionState.CANCELING) > 0);
assertTrue(vertex.getStateTimestamp(ExecutionState.CANCELED) > 0);
}
catch (Exception e) {
e.printStackTrace();
fail(e.getMessage());
}
}
@Test
public void testCancelConcurrentlyToDeploying_CallsNotOvertaking() {
try {
final JobVertexID jid = new JobVertexID();
final TestingUtils.QueuedActionExecutionContext executionContext = TestingUtils.queuedActionExecutionContext();
final TestingUtils.ActionQueue actions = executionContext.actionQueue();
final ExecutionJobVertex ejv = getExecutionVertex(
jid,
executionContext
);
final ExecutionVertex vertex = new ExecutionVertex(ejv, 0, new IntermediateResult[0],
AkkaUtils.getDefaultTimeout());
setVertexState(vertex, ExecutionState.SCHEDULED);
assertEquals(ExecutionState.SCHEDULED, vertex.getExecutionState());
ActorGateway actorGateway = new CancelSequenceActorGateway(
executionContext, 2);
Instance instance = getInstance(new ActorTaskManagerGateway(actorGateway));
SimpleSlot slot = instance.allocateSimpleSlot(new JobID());
vertex.deployToSlot(slot);
assertEquals(ExecutionState.DEPLOYING, vertex.getExecutionState());
vertex.cancel();
assertEquals(ExecutionState.CANCELING, vertex.getExecutionState());
// first action happens (deploy)
actions.triggerNextAction();
assertEquals(ExecutionState.CANCELING, vertex.getExecutionState());
// the deploy call found itself in canceling after it returned and needs to send a cancel call
// the call did not yet execute, so it is still in canceling
assertEquals(ExecutionState.CANCELING, vertex.getExecutionState());
// second action happens (cancel call from cancel function)
actions.triggerNextAction();
// TaskManager reports back (canceling done)
vertex.getCurrentExecutionAttempt().cancelingComplete();
// should properly set state to cancelled
assertEquals(ExecutionState.CANCELED, vertex.getExecutionState());
// trigger the correction canceling call
actions.triggerNextAction();
assertEquals(ExecutionState.CANCELED, vertex.getExecutionState());
assertTrue(slot.isReleased());
assertNull(vertex.getFailureCause());
assertTrue(vertex.getStateTimestamp(ExecutionState.CREATED) > 0);
assertTrue(vertex.getStateTimestamp(ExecutionState.CANCELING) > 0);
assertTrue(vertex.getStateTimestamp(ExecutionState.CANCELED) > 0);
}
catch(Exception e) {
e.printStackTrace();
fail(e.getMessage());
}
}
@Test
public void testCancelConcurrentlyToDeploying_CallsOvertaking() {
try {
final JobVertexID jid = new JobVertexID();
final TestingUtils.QueuedActionExecutionContext executionContext = TestingUtils.queuedActionExecutionContext();
final TestingUtils.ActionQueue actions = executionContext.actionQueue();
final ExecutionJobVertex ejv = getExecutionVertex(jid, executionContext);
final ExecutionVertex vertex = new ExecutionVertex(ejv, 0, new IntermediateResult[0],
AkkaUtils.getDefaultTimeout());
setVertexState(vertex, ExecutionState.SCHEDULED);
assertEquals(ExecutionState.SCHEDULED, vertex.getExecutionState());
// task manager cancel sequence mock actor
// first return NOT SUCCESS (task not found, cancel call overtook deploy call), then success (cancel call after deploy call)
ActorGateway actorGateway = new CancelSequenceActorGateway(
executionContext,
2);
Instance instance = getInstance(new ActorTaskManagerGateway(actorGateway));
SimpleSlot slot = instance.allocateSimpleSlot(new JobID());
vertex.deployToSlot(slot);
assertEquals(ExecutionState.DEPLOYING, vertex.getExecutionState());
vertex.cancel();
assertEquals(ExecutionState.CANCELING, vertex.getExecutionState());
// first action happens (deploy)
Runnable deployAction = actions.popNextAction();
Runnable cancelAction = actions.popNextAction();
// cancel call first
cancelAction.run();
// process onComplete callback
actions.triggerNextAction();
// did not find the task, not properly cancelled, stay in canceling
assertEquals(ExecutionState.CANCELING, vertex.getExecutionState());
// deploy action next
deployAction.run();
// the deploy call found itself in canceling after it returned and needs to send a cancel call
// the call did not yet execute, so it is still in canceling
assertEquals(ExecutionState.CANCELING, vertex.getExecutionState());
vertex.getCurrentExecutionAttempt().cancelingComplete();
assertEquals(ExecutionState.CANCELED, vertex.getExecutionState());
assertTrue(slot.isReleased());
assertNull(vertex.getFailureCause());
assertTrue(vertex.getStateTimestamp(ExecutionState.CREATED) > 0);
assertTrue(vertex.getStateTimestamp(ExecutionState.CANCELING) > 0);
assertTrue(vertex.getStateTimestamp(ExecutionState.CANCELED) > 0);
} catch (Exception e) {
e.printStackTrace();
fail(e.getMessage());
}
}
@Test
public void testCancelFromRunning() {
try {
final JobVertexID jid = new JobVertexID();
final ExecutionJobVertex ejv = getExecutionVertex(jid, new DirectScheduledExecutorService());
final ExecutionVertex vertex = new ExecutionVertex(ejv, 0, new IntermediateResult[0],
AkkaUtils.getDefaultTimeout());
ActorGateway actorGateway = new CancelSequenceActorGateway(
TestingUtils.directExecutionContext(),
1);
Instance instance = getInstance(new ActorTaskManagerGateway(actorGateway));
SimpleSlot slot = instance.allocateSimpleSlot(new JobID());
setVertexState(vertex, ExecutionState.RUNNING);
setVertexResource(vertex, slot);
assertEquals(ExecutionState.RUNNING, vertex.getExecutionState());
vertex.cancel();
vertex.getCurrentExecutionAttempt().cancelingComplete(); // response by task manager once actually canceled
assertEquals(ExecutionState.CANCELED, vertex.getExecutionState());
assertTrue(slot.isReleased());
assertNull(vertex.getFailureCause());
assertTrue(vertex.getStateTimestamp(ExecutionState.CREATED) > 0);
assertTrue(vertex.getStateTimestamp(ExecutionState.CANCELING) > 0);
assertTrue(vertex.getStateTimestamp(ExecutionState.CANCELED) > 0);
} catch (Exception e) {
e.printStackTrace();
fail(e.getMessage());
}
}
@Test
public void testRepeatedCancelFromRunning() {
try {
final JobVertexID jid = new JobVertexID();
final ExecutionJobVertex ejv = getExecutionVertex(jid, new DirectScheduledExecutorService());
final ExecutionVertex vertex = new ExecutionVertex(ejv, 0, new IntermediateResult[0],
AkkaUtils.getDefaultTimeout());
final ActorGateway actorGateway = new CancelSequenceActorGateway(
TestingUtils.directExecutionContext(),
1);
Instance instance = getInstance(new ActorTaskManagerGateway(actorGateway));
SimpleSlot slot = instance.allocateSimpleSlot(new JobID());
setVertexState(vertex, ExecutionState.RUNNING);
setVertexResource(vertex, slot);
assertEquals(ExecutionState.RUNNING, vertex.getExecutionState());
vertex.cancel();
assertEquals(ExecutionState.CANCELING, vertex.getExecutionState());
vertex.cancel();
assertEquals(ExecutionState.CANCELING, vertex.getExecutionState());
// callback by TaskManager after canceling completes
vertex.getCurrentExecutionAttempt().cancelingComplete();
assertEquals(ExecutionState.CANCELED, vertex.getExecutionState());
assertTrue(slot.isReleased());
assertNull(vertex.getFailureCause());
assertTrue(vertex.getStateTimestamp(ExecutionState.CREATED) > 0);
assertTrue(vertex.getStateTimestamp(ExecutionState.CANCELING) > 0);
assertTrue(vertex.getStateTimestamp(ExecutionState.CANCELED) > 0);
} catch (Exception e) {
e.printStackTrace();
fail(e.getMessage());
}
}
@Test
public void testCancelFromRunningDidNotFindTask() {
// this may happen when the task finished or failed while the call was in progress
try {
final JobVertexID jid = new JobVertexID();
final ExecutionJobVertex ejv = getExecutionVertex(jid, new DirectScheduledExecutorService());
final ExecutionVertex vertex = new ExecutionVertex(ejv, 0, new IntermediateResult[0],
AkkaUtils.getDefaultTimeout());
final ActorGateway actorGateway = new CancelSequenceActorGateway(
TestingUtils.directExecutionContext(),
1);
Instance instance = getInstance(new ActorTaskManagerGateway(actorGateway));
SimpleSlot slot = instance.allocateSimpleSlot(new JobID());
setVertexState(vertex, ExecutionState.RUNNING);
setVertexResource(vertex, slot);
assertEquals(ExecutionState.RUNNING, vertex.getExecutionState());
vertex.cancel();
assertEquals(ExecutionState.CANCELING, vertex.getExecutionState());
assertNull(vertex.getFailureCause());
assertTrue(vertex.getStateTimestamp(ExecutionState.CREATED) > 0);
assertTrue(vertex.getStateTimestamp(ExecutionState.CANCELING) > 0);
} catch (Exception e) {
e.printStackTrace();
fail(e.getMessage());
}
}
@Test
public void testCancelCallFails() {
try {
final JobVertexID jid = new JobVertexID();
final ExecutionJobVertex ejv = getExecutionVertex(jid, new DirectScheduledExecutorService());
final ExecutionVertex vertex = new ExecutionVertex(ejv, 0, new IntermediateResult[0],
AkkaUtils.getDefaultTimeout());
final ActorGateway gateway = new CancelSequenceActorGateway(TestingUtils.directExecutionContext(), 0);
Instance instance = getInstance(new ActorTaskManagerGateway(gateway));
SimpleSlot slot = instance.allocateSimpleSlot(new JobID());
setVertexState(vertex, ExecutionState.RUNNING);
setVertexResource(vertex, slot);
assertEquals(ExecutionState.RUNNING, vertex.getExecutionState());
vertex.cancel();
// Callback fails, leading to CANCELED
assertEquals(ExecutionState.CANCELED, vertex.getExecutionState());
assertTrue(slot.isReleased());
assertTrue(vertex.getStateTimestamp(ExecutionState.CREATED) > 0);
assertTrue(vertex.getStateTimestamp(ExecutionState.CANCELING) > 0);
} catch (Exception e) {
e.printStackTrace();
fail(e.getMessage());
}
}
@Test
public void testSendCancelAndReceiveFail() throws Exception {
final ExecutionGraph graph = ExecutionGraphTestUtils.createSimpleTestGraph();
graph.scheduleForExecution();
ExecutionGraphTestUtils.switchAllVerticesToRunning(graph);
assertEquals(JobStatus.RUNNING, graph.getState());
final ExecutionVertex[] vertices = graph.getVerticesTopologically().iterator().next().getTaskVertices();
assertEquals(vertices.length, graph.getRegisteredExecutions().size());
final Execution exec = vertices[3].getCurrentExecutionAttempt();
exec.cancel();
assertEquals(ExecutionState.CANCELING, exec.getState());
exec.markFailed(new Exception("test"));
assertTrue(exec.getState() == ExecutionState.FAILED || exec.getState() == ExecutionState.CANCELED);
assertTrue(exec.getAssignedResource().isCanceled());
assertEquals(vertices.length - 1, exec.getVertex().getExecutionGraph().getRegisteredExecutions().size());
}
// --------------------------------------------------------------------------------------------
// Actions after a vertex has been canceled or while canceling
// --------------------------------------------------------------------------------------------
@Test
public void testScheduleOrDeployAfterCancel() {
try {
final JobVertexID jid = new JobVertexID();
final ExecutionJobVertex ejv = getExecutionVertex(jid);
final ExecutionVertex vertex = new ExecutionVertex(ejv, 0, new IntermediateResult[0],
AkkaUtils.getDefaultTimeout());
setVertexState(vertex, ExecutionState.CANCELED);
assertEquals(ExecutionState.CANCELED, vertex.getExecutionState());
// 1)
// scheduling after being canceled should be tolerated (no exception) because
// it can occur as the result of races
{
Scheduler scheduler = mock(Scheduler.class);
vertex.scheduleForExecution(scheduler, false);
assertEquals(ExecutionState.CANCELED, vertex.getExecutionState());
}
// 2)
// deploying after canceling from CREATED needs to raise an exception, because
// the scheduler (or any caller) needs to know that the slot should be released
try {
Instance instance = getInstance(new ActorTaskManagerGateway(DummyActorGateway.INSTANCE));
SimpleSlot slot = instance.allocateSimpleSlot(new JobID());
vertex.deployToSlot(slot);
fail("Method should throw an exception");
}
catch (IllegalStateException e) {
assertEquals(ExecutionState.CANCELED, vertex.getExecutionState());
}
}
catch (Exception e) {
e.printStackTrace();
fail(e.getMessage());
}
}
@Test
public void testActionsWhileCancelling() {
try {
final JobVertexID jid = new JobVertexID();
final ExecutionJobVertex ejv = getExecutionVertex(jid);
// scheduling while canceling is an illegal state transition
try {
ExecutionVertex vertex = new ExecutionVertex(ejv, 0, new IntermediateResult[0],
AkkaUtils.getDefaultTimeout());
setVertexState(vertex, ExecutionState.CANCELING);
Scheduler scheduler = mock(Scheduler.class);
vertex.scheduleForExecution(scheduler, false);
}
catch (Exception e) {
fail("should not throw an exception");
}
// deploying while in canceling state is illegal (should immediately go to canceled)
try {
ExecutionVertex vertex = new ExecutionVertex(ejv, 0, new IntermediateResult[0],
AkkaUtils.getDefaultTimeout());
setVertexState(vertex, ExecutionState.CANCELING);
Instance instance = getInstance(new ActorTaskManagerGateway(DummyActorGateway.INSTANCE));
SimpleSlot slot = instance.allocateSimpleSlot(new JobID());
vertex.deployToSlot(slot);
fail("Method should throw an exception");
}
catch (IllegalStateException e) {
// that is what we expect
}
// fail while canceling
{
ExecutionVertex vertex = new ExecutionVertex(ejv, 0, new IntermediateResult[0],
AkkaUtils.getDefaultTimeout());
Instance instance = getInstance(new ActorTaskManagerGateway(DummyActorGateway.INSTANCE));
SimpleSlot slot = instance.allocateSimpleSlot(new JobID());
setVertexResource(vertex, slot);
setVertexState(vertex, ExecutionState.CANCELING);
Exception failureCause = new Exception("test exception");
vertex.fail(failureCause);
assertEquals(ExecutionState.CANCELED, vertex.getExecutionState());
assertTrue(slot.isReleased());
}
}
catch (Exception e) {
e.printStackTrace();
fail(e.getMessage());
}
}
public static class CancelSequenceActorGateway extends BaseTestingActorGateway {
private final int successfulOperations;
private int index = -1;
public CancelSequenceActorGateway(ExecutionContext executionContext, int successfulOperations) {
super(executionContext);
this.successfulOperations = successfulOperations;
}
@Override
public Object handleMessage(Object message) throws Exception {
Object result;
if(message instanceof SubmitTask) {
result = Acknowledge.get();
} else if(message instanceof CancelTask) {
index++;
if(index >= successfulOperations){
throw new IOException("RPC call failed.");
} else {
result = Acknowledge.get();
}
} else {
result = null;
}
return result;
}
}
}