/* * Licensed to Elasticsearch under one or more contributor * license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright * ownership. Elasticsearch licenses this file to you under * the Apache License, Version 2.0 (the "License"); you may * not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.elasticsearch.action.admin.cluster.node.tasks; import com.carrotsearch.randomizedtesting.RandomizedContext; import com.carrotsearch.randomizedtesting.generators.RandomNumbers; import org.elasticsearch.action.ActionListener; import org.elasticsearch.action.admin.cluster.node.tasks.cancel.CancelTasksRequest; import org.elasticsearch.action.admin.cluster.node.tasks.cancel.CancelTasksResponse; import org.elasticsearch.action.admin.cluster.node.tasks.list.ListTasksRequest; import org.elasticsearch.action.admin.cluster.node.tasks.list.ListTasksResponse; import org.elasticsearch.action.support.nodes.BaseNodeRequest; import org.elasticsearch.action.support.nodes.BaseNodesRequest; import org.elasticsearch.action.support.replication.ClusterStateCreationUtils; import org.elasticsearch.cluster.node.DiscoveryNode; import org.elasticsearch.cluster.service.ClusterService; import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamOutput; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.tasks.CancellableTask; import org.elasticsearch.tasks.Task; import org.elasticsearch.tasks.TaskCancelledException; import org.elasticsearch.tasks.TaskId; import org.elasticsearch.tasks.TaskInfo; import org.elasticsearch.threadpool.ThreadPool; import org.elasticsearch.transport.TransportService; import java.io.IOException; import java.util.Arrays; import java.util.Collection; import java.util.List; import java.util.Random; import java.util.concurrent.CountDownLatch; import java.util.concurrent.ExecutionException; import java.util.concurrent.atomic.AtomicReference; import static org.elasticsearch.test.ClusterServiceUtils.setState; import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.greaterThanOrEqualTo; import static org.hamcrest.Matchers.lessThanOrEqualTo; public class CancellableTasksTests extends TaskManagerTestCase { public static class CancellableNodeRequest extends BaseNodeRequest { protected String requestName; protected String nodeId; public CancellableNodeRequest() { super(); } public CancellableNodeRequest(CancellableNodesRequest request, String nodeId) { super(nodeId); requestName = request.requestName; this.nodeId = nodeId; } @Override public void readFrom(StreamInput in) throws IOException { super.readFrom(in); requestName = in.readString(); nodeId = in.readString(); } @Override public void writeTo(StreamOutput out) throws IOException { super.writeTo(out); out.writeString(requestName); out.writeString(nodeId); } @Override public String getDescription() { return "CancellableNodeRequest[" + requestName + ", " + nodeId + "]"; } @Override public Task createTask(long id, String type, String action, TaskId parentTaskId) { return new CancellableTask(id, type, action, getDescription(), parentTaskId) { @Override public boolean shouldCancelChildrenOnCancellation() { return false; } }; } } public static class CancellableNodesRequest extends BaseNodesRequest<CancellableNodesRequest> { private String requestName; private CancellableNodesRequest() { super(); } public CancellableNodesRequest(String requestName, String... nodesIds) { super(nodesIds); this.requestName = requestName; } @Override public void readFrom(StreamInput in) throws IOException { super.readFrom(in); requestName = in.readString(); } @Override public void writeTo(StreamOutput out) throws IOException { super.writeTo(out); out.writeString(requestName); } @Override public String getDescription() { return "CancellableNodesRequest[" + requestName + "]"; } @Override public Task createTask(long id, String type, String action, TaskId parentTaskId) { return new CancellableTask(id, type, action, getDescription(), parentTaskId) { @Override public boolean shouldCancelChildrenOnCancellation() { return true; } }; } } /** * Simulates a cancellable node-based task that can be used to block node tasks so they are guaranteed to be registered by task manager */ class CancellableTestNodesAction extends AbstractTestNodesAction<CancellableNodesRequest, CancellableNodeRequest> { // True if the node operation should get stuck until its cancelled final boolean shouldBlock; final CountDownLatch actionStartedLatch; CancellableTestNodesAction(Settings settings, String actionName, ThreadPool threadPool, ClusterService clusterService, TransportService transportService, boolean shouldBlock, CountDownLatch actionStartedLatch) { super(settings, actionName, threadPool, clusterService, transportService, CancellableNodesRequest::new, CancellableNodeRequest::new); this.shouldBlock = shouldBlock; this.actionStartedLatch = actionStartedLatch; } @Override protected CancellableNodeRequest newNodeRequest(String nodeId, CancellableNodesRequest request) { return new CancellableNodeRequest(request, nodeId); } @Override protected NodeResponse nodeOperation(CancellableNodeRequest request, Task task) { assert task instanceof CancellableTask; debugDelay(request.nodeId, "op1"); if (actionStartedLatch != null) { actionStartedLatch.countDown(); } debugDelay(request.nodeId, "op2"); if (shouldBlock) { // Simulate a job that takes forever to finish // Using periodic checks method to identify that the task was cancelled try { awaitBusy(() -> { if (((CancellableTask) task).isCancelled()) { throw new TaskCancelledException("Cancelled"); } return false; }); fail("It should have thrown an exception"); } catch (InterruptedException ex) { Thread.currentThread().interrupt(); } } debugDelay(request.nodeId, "op4"); return new NodeResponse(clusterService.localNode()); } @Override protected NodeResponse nodeOperation(CancellableNodeRequest request) { throw new UnsupportedOperationException("the task parameter is required"); } } private Task startCancellableTestNodesAction(boolean waitForActionToStart, int blockedNodesCount, ActionListener<NodesResponse> listener) throws InterruptedException { return startCancellableTestNodesAction(waitForActionToStart, randomSubsetOf(blockedNodesCount, testNodes), new CancellableNodesRequest("Test Request"), listener); } private Task startCancellableTestNodesAction(boolean waitForActionToStart, Collection<TestNode> blockOnNodes, CancellableNodesRequest request, ActionListener<NodesResponse> listener) throws InterruptedException { CountDownLatch actionLatch = waitForActionToStart ? new CountDownLatch(nodesCount) : null; CancellableTestNodesAction[] actions = new CancellableTestNodesAction[nodesCount]; for (int i = 0; i < testNodes.length; i++) { boolean shouldBlock = blockOnNodes.contains(testNodes[i]); logger.info("The action in the node [{}] should block: [{}]", testNodes[i].getNodeId(), shouldBlock); actions[i] = new CancellableTestNodesAction(CLUSTER_SETTINGS, "testAction", threadPool, testNodes[i] .clusterService, testNodes[i].transportService, shouldBlock, actionLatch); } Task task = actions[0].execute(request, listener); if (waitForActionToStart) { logger.info("Awaiting for all actions to start"); actionLatch.await(); logger.info("Done waiting for all actions to start"); } return task; } public void testBasicTaskCancellation() throws Exception { setupTestNodes(Settings.EMPTY); connectNodes(testNodes); CountDownLatch responseLatch = new CountDownLatch(1); boolean waitForActionToStart = randomBoolean(); logger.info("waitForActionToStart is set to {}", waitForActionToStart); final AtomicReference<NodesResponse> responseReference = new AtomicReference<>(); final AtomicReference<Throwable> throwableReference = new AtomicReference<>(); int blockedNodesCount = randomIntBetween(0, nodesCount); Task mainTask = startCancellableTestNodesAction(waitForActionToStart, blockedNodesCount, new ActionListener<NodesResponse>() { @Override public void onResponse(NodesResponse listTasksResponse) { responseReference.set(listTasksResponse); responseLatch.countDown(); } @Override public void onFailure(Exception e) { throwableReference.set(e); responseLatch.countDown(); } }); // Cancel main task CancelTasksRequest request = new CancelTasksRequest(); request.setReason("Testing Cancellation"); request.setTaskId(new TaskId(testNodes[0].getNodeId(), mainTask.getId())); // And send the cancellation request to a random node CancelTasksResponse response = testNodes[randomIntBetween(0, testNodes.length - 1)].transportCancelTasksAction.execute(request) .get(); // Awaiting for the main task to finish responseLatch.await(); if (response.getTasks().size() == 0) { // We didn't cancel the request and it finished successfully // That should be rare and can be only in case we didn't block on a single node assertEquals(0, blockedNodesCount); // Make sure that the request was successful assertNull(throwableReference.get()); assertNotNull(responseReference.get()); assertEquals(nodesCount, responseReference.get().getNodes().size()); assertEquals(0, responseReference.get().failureCount()); } else { // We canceled the request, in this case it should have fail, but we should get partial response assertNull(throwableReference.get()); assertEquals(nodesCount, responseReference.get().failureCount() + responseReference.get().getNodes().size()); // and we should have at least as many failures as the number of blocked operations // (we might have cancelled some non-blocked operations before they even started and that's ok) assertThat(responseReference.get().failureCount(), greaterThanOrEqualTo(blockedNodesCount)); // We should have the information about the cancelled task in the cancel operation response assertEquals(1, response.getTasks().size()); assertEquals(mainTask.getId(), response.getTasks().get(0).getId()); // Verify that all cancelled tasks reported that they support cancellation for(TaskInfo taskInfo : response.getTasks()) { assertTrue(taskInfo.isCancellable()); } } // Make sure that tasks are no longer running ListTasksResponse listTasksResponse = testNodes[randomIntBetween(0, testNodes.length - 1)] .transportListTasksAction.execute(new ListTasksRequest().setTaskId( new TaskId(testNodes[0].getNodeId(), mainTask.getId()))).get(); assertEquals(0, listTasksResponse.getTasks().size()); // Make sure that there are no leftover bans, the ban removal is async, so we might return from the cancellation // while the ban is still there, but it should disappear shortly assertBusy(() -> { for (int i = 0; i < testNodes.length; i++) { assertEquals("No bans on the node " + i, 0, testNodes[i].transportService.getTaskManager().getBanCount()); } }); } public void testChildTasksCancellation() throws Exception { setupTestNodes(Settings.EMPTY); connectNodes(testNodes); CountDownLatch responseLatch = new CountDownLatch(1); final AtomicReference<NodesResponse> responseReference = new AtomicReference<>(); final AtomicReference<Throwable> throwableReference = new AtomicReference<>(); Task mainTask = startCancellableTestNodesAction(true, nodesCount, new ActionListener<NodesResponse>() { @Override public void onResponse(NodesResponse listTasksResponse) { responseReference.set(listTasksResponse); responseLatch.countDown(); } @Override public void onFailure(Exception e) { throwableReference.set(e); responseLatch.countDown(); } }); // Cancel all child tasks without cancelling the main task, which should quit on its own CancelTasksRequest request = new CancelTasksRequest(); request.setReason("Testing Cancellation"); request.setParentTaskId(new TaskId(testNodes[0].getNodeId(), mainTask.getId())); // And send the cancellation request to a random node CancelTasksResponse response = testNodes[randomIntBetween(1, testNodes.length - 1)].transportCancelTasksAction.execute(request) .get(); // Awaiting for the main task to finish responseLatch.await(); // Should have cancelled tasks on all nodes assertThat(response.getTasks().size(), equalTo(testNodes.length)); assertBusy(() -> { try { // Make sure that main task is no longer running ListTasksResponse listTasksResponse = testNodes[randomIntBetween(0, testNodes.length - 1)] .transportListTasksAction.execute(new ListTasksRequest().setTaskId( new TaskId(testNodes[0].getNodeId(), mainTask.getId()))).get(); assertEquals(0, listTasksResponse.getTasks().size()); } catch (ExecutionException | InterruptedException ex) { throw new RuntimeException(ex); } }); } public void testTaskCancellationOnCoordinatingNodeLeavingTheCluster() throws Exception { setupTestNodes(Settings.EMPTY); connectNodes(testNodes); CountDownLatch responseLatch = new CountDownLatch(1); boolean simulateBanBeforeLeaving = randomBoolean(); final AtomicReference<NodesResponse> responseReference = new AtomicReference<>(); final AtomicReference<Throwable> throwableReference = new AtomicReference<>(); int blockedNodesCount = randomIntBetween(0, nodesCount - 1); // We shouldn't block on the first node since it's leaving the cluster anyway so it doesn't matter List<TestNode> blockOnNodes = randomSubsetOf(blockedNodesCount, Arrays.copyOfRange(testNodes, 1, nodesCount)); Task mainTask = startCancellableTestNodesAction(true, blockOnNodes, new CancellableNodesRequest("Test Request"), new ActionListener<NodesResponse>() { @Override public void onResponse(NodesResponse listTasksResponse) { responseReference.set(listTasksResponse); responseLatch.countDown(); } @Override public void onFailure(Exception e) { throwableReference.set(e); responseLatch.countDown(); } }); String mainNode = testNodes[0].getNodeId(); // Make sure that tasks are running ListTasksResponse listTasksResponse = testNodes[randomIntBetween(0, testNodes.length - 1)] .transportListTasksAction.execute(new ListTasksRequest().setParentTaskId(new TaskId(mainNode, mainTask.getId()))).get(); assertThat(listTasksResponse.getTasks().size(), greaterThanOrEqualTo(blockOnNodes.size())); // Simulate the coordinating node leaving the cluster DiscoveryNode[] discoveryNodes = new DiscoveryNode[testNodes.length - 1]; for (int i = 1; i < testNodes.length; i++) { discoveryNodes[i - 1] = testNodes[i].discoveryNode(); } DiscoveryNode master = discoveryNodes[0]; for (int i = 1; i < testNodes.length; i++) { // Notify only nodes that should remain in the cluster setState(testNodes[i].clusterService, ClusterStateCreationUtils.state(testNodes[i].discoveryNode(), master, discoveryNodes)); } if (simulateBanBeforeLeaving) { logger.info("--> Simulate issuing cancel request on the node that is about to leave the cluster"); // Simulate issuing cancel request on the node that is about to leave the cluster CancelTasksRequest request = new CancelTasksRequest(); request.setReason("Testing Cancellation"); request.setTaskId(new TaskId(testNodes[0].getNodeId(), mainTask.getId())); // And send the cancellation request to a random node CancelTasksResponse response = testNodes[0].transportCancelTasksAction.execute(request).get(); logger.info("--> Done simulating issuing cancel request on the node that is about to leave the cluster"); // This node still thinks that's part of the cluster, so cancelling should look successful if (response.getTasks().size() == 0) { logger.error("!!!!"); } assertThat(response.getTasks().size(), lessThanOrEqualTo(1)); assertThat(response.getTaskFailures().size(), lessThanOrEqualTo(1)); assertThat(response.getTaskFailures().size() + response.getTasks().size(), lessThanOrEqualTo(1)); } for (int i = 1; i < testNodes.length; i++) { assertEquals("No bans on the node " + i, 0, testNodes[i].transportService.getTaskManager().getBanCount()); } // Close the first node testNodes[0].close(); assertBusy(() -> { // Make sure that tasks are no longer running try { ListTasksResponse listTasksResponse1 = testNodes[randomIntBetween(1, testNodes.length - 1)] .transportListTasksAction.execute(new ListTasksRequest().setTaskId(new TaskId(mainNode, mainTask.getId()))).get(); assertEquals(0, listTasksResponse1.getTasks().size()); } catch (InterruptedException ex) { Thread.currentThread().interrupt(); } catch (ExecutionException ex2) { fail("shouldn't be here"); } }); // Wait for clean up responseLatch.await(); } private static void debugDelay(String nodeId, String name) { // Introduce an additional pseudo random repeatable race conditions String delayName = RandomizedContext.current().getRunnerSeedAsString() + ":" + nodeId + ":" + name; Random random = new Random(delayName.hashCode()); if (RandomNumbers.randomIntBetween(random, 0, 10) < 1) { try { Thread.sleep(RandomNumbers.randomIntBetween(random, 20, 50)); } catch (InterruptedException ex) { Thread.currentThread().interrupt(); } } } }