/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.runtime.executiongraph;

import org.apache.flink.api.common.ExecutionConfig;
import org.apache.flink.api.common.JobID;
import org.apache.flink.api.common.accumulators.Accumulator;
import org.apache.flink.api.common.accumulators.IntCounter;
import org.apache.flink.api.common.time.Time;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.configuration.CoreOptions;
import org.apache.flink.metrics.groups.UnregisteredMetricsGroup;
import org.apache.flink.runtime.accumulators.AccumulatorSnapshot;
import org.apache.flink.runtime.akka.AkkaUtils;
import org.apache.flink.runtime.checkpoint.StandaloneCheckpointRecoveryFactory;
import org.apache.flink.runtime.deployment.InputGateDeploymentDescriptor;
import org.apache.flink.runtime.deployment.ResultPartitionDeploymentDescriptor;
import org.apache.flink.runtime.deployment.TaskDeploymentDescriptor;
import org.apache.flink.runtime.execution.ExecutionState;
import org.apache.flink.runtime.executiongraph.restart.NoRestartStrategy;
import org.apache.flink.runtime.instance.Instance;
import org.apache.flink.runtime.instance.SimpleSlot;
import org.apache.flink.runtime.io.network.partition.ResultPartitionType;
import org.apache.flink.runtime.jobgraph.DistributionPattern;
import org.apache.flink.runtime.jobgraph.JobGraph;
import org.apache.flink.runtime.jobgraph.JobStatus;
import org.apache.flink.runtime.jobgraph.JobVertex;
import org.apache.flink.runtime.jobgraph.JobVertexID;
import org.apache.flink.runtime.jobgraph.tasks.ExternalizedCheckpointSettings;
import org.apache.flink.runtime.jobgraph.tasks.JobCheckpointingSettings;
import org.apache.flink.runtime.jobmanager.scheduler.Scheduler;
import org.apache.flink.runtime.jobmanager.slots.ActorTaskManagerGateway;
import org.apache.flink.runtime.operators.BatchTask;
import org.apache.flink.runtime.taskmanager.TaskExecutionState;
import org.apache.flink.runtime.testingUtils.TestingUtils;
import org.apache.flink.runtime.testutils.DirectScheduledExecutorService;
import org.apache.flink.util.SerializedValue;

import org.junit.Test;
import org.slf4j.LoggerFactory;

import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ScheduledExecutorService;

import static org.apache.flink.runtime.executiongraph.ExecutionGraphTestUtils.getInstance;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.fail;

/**
 * Tests for the deployment of tasks from an {@link ExecutionGraph}.
 */
public class ExecutionGraphDeploymentTest {

	@Test
	public void testBuildDeploymentDescriptor() {
		try {
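			// Topology under test (all vertices have parallelism 10, all edges
			// are ALL_TO_ALL / PIPELINED):
			//
			//     v1 -> v2 -> v3
			//            \
			//             -> v4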
			final JobID jobId = new JobID();

			final JobVertexID jid1 = new JobVertexID();
			final JobVertexID jid2 = new JobVertexID();
			final JobVertexID jid3 = new JobVertexID();
			final JobVertexID jid4 = new JobVertexID();

			JobVertex v1 = new JobVertex("v1", jid1);
			JobVertex v2 = new JobVertex("v2", jid2);
			JobVertex v3 = new JobVertex("v3", jid3);
			JobVertex v4 = new JobVertex("v4", jid4);

			v1.setParallelism(10);
			v2.setParallelism(10);
			v3.setParallelism(10);
			v4.setParallelism(10);

			v1.setInvokableClass(BatchTask.class);
			v2.setInvokableClass(BatchTask.class);
			v3.setInvokableClass(BatchTask.class);
			v4.setInvokableClass(BatchTask.class);

			v2.connectNewDataSetAsInput(v1, DistributionPattern.ALL_TO_ALL, ResultPartitionType.PIPELINED);
			v3.connectNewDataSetAsInput(v2, DistributionPattern.ALL_TO_ALL, ResultPartitionType.PIPELINED);
			v4.connectNewDataSetAsInput(v2, DistributionPattern.ALL_TO_ALL, ResultPartitionType.PIPELINED);

			ExecutionGraph eg = new ExecutionGraph(
				TestingUtils.defaultExecutor(),
				TestingUtils.defaultExecutor(),
				jobId,
				"some job",
				new Configuration(),
				new SerializedValue<>(new ExecutionConfig()),
				AkkaUtils.getDefaultTimeout(),
				new NoRestartStrategy(),
				new Scheduler(TestingUtils.defaultExecutionContext()));

			List<JobVertex> ordered = Arrays.asList(v1, v2, v3, v4);

			eg.attachJobGraph(ordered);

			ExecutionJobVertex ejv = eg.getAllVertices().get(jid2);
			ExecutionVertex vertex = ejv.getTaskVertices()[3];

			ExecutionGraphTestUtils.SimpleActorGateway instanceGateway =
				new ExecutionGraphTestUtils.SimpleActorGateway(TestingUtils.directExecutionContext());

			final Instance instance = getInstance(new ActorTaskManagerGateway(instanceGateway));

			final SimpleSlot slot = instance.allocateSimpleSlot(jobId);

			assertEquals(ExecutionState.CREATED, vertex.getExecutionState());

			vertex.deployToSlot(slot);

			assertEquals(ExecutionState.DEPLOYING, vertex.getExecutionState());

			TaskDeploymentDescriptor descr = instanceGateway.lastTDD;
			assertNotNull(descr);

			JobInformation jobInformation =
				descr.getSerializedJobInformation().deserializeValue(getClass().getClassLoader());
			TaskInformation taskInformation =
				descr.getSerializedTaskInformation().deserializeValue(getClass().getClassLoader());

			assertEquals(jobId, jobInformation.getJobId());
			assertEquals(jid2, taskInformation.getJobVertexId());
			assertEquals(3, descr.getSubtaskIndex());
			assertEquals(10, taskInformation.getNumberOfSubtasks());
			assertEquals(BatchTask.class.getName(), taskInformation.getInvokableClassName());
			assertEquals("v2", taskInformation.getTaskName());

			Collection<ResultPartitionDeploymentDescriptor> producedPartitions = descr.getProducedPartitions();
			Collection<InputGateDeploymentDescriptor> consumedPartitions = descr.getInputGates();

			assertEquals(2, producedPartitions.size());
			assertEquals(1, consumedPartitions.size());

			Iterator<ResultPartitionDeploymentDescriptor> iteratorProducedPartitions = producedPartitions.iterator();
			Iterator<InputGateDeploymentDescriptor> iteratorConsumedPartitions = consumedPartitions.iterator();

			assertEquals(10, iteratorProducedPartitions.next().getNumberOfSubpartitions());
			assertEquals(10, iteratorProducedPartitions.next().getNumberOfSubpartitions());
			assertEquals(10, iteratorConsumedPartitions.next().getInputChannelDeploymentDescriptors().length);
		}
		catch (Exception e) {
			e.printStackTrace();
			fail(e.getMessage());
		}
	}
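
	/**
	 * Verifies that executions are deregistered from the execution graph's
	 * registry once they are marked FINISHED.
	 */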
	@Test
	public void testRegistrationOfExecutionsFinishing() {
		try {
			final JobVertexID jid1 = new JobVertexID();
			final JobVertexID jid2 = new JobVertexID();

			JobVertex v1 = new JobVertex("v1", jid1);
			JobVertex v2 = new JobVertex("v2", jid2);

			Map<ExecutionAttemptID, Execution> executions = setupExecution(v1, 7650, v2, 2350).f1;

			for (Execution e : executions.values()) {
				e.markFinished();
			}

			assertEquals(0, executions.size());
		}
		catch (Exception e) {
			e.printStackTrace();
			fail(e.getMessage());
		}
	}

	@Test
	public void testRegistrationOfExecutionsFailing() {
		try {
			final JobVertexID jid1 = new JobVertexID();
			final JobVertexID jid2 = new JobVertexID();

			JobVertex v1 = new JobVertex("v1", jid1);
			JobVertex v2 = new JobVertex("v2", jid2);

			Map<ExecutionAttemptID, Execution> executions = setupExecution(v1, 7, v2, 6).f1;

			for (Execution e : executions.values()) {
				e.markFailed(null);
			}

			assertEquals(0, executions.size());
		}
		catch (Exception e) {
			e.printStackTrace();
			fail(e.getMessage());
		}
	}

	@Test
	public void testRegistrationOfExecutionsFailedExternally() {
		try {
			final JobVertexID jid1 = new JobVertexID();
			final JobVertexID jid2 = new JobVertexID();

			JobVertex v1 = new JobVertex("v1", jid1);
			JobVertex v2 = new JobVertex("v2", jid2);

			Map<ExecutionAttemptID, Execution> executions = setupExecution(v1, 7, v2, 6).f1;

			for (Execution e : executions.values()) {
				e.fail(null);
			}

			assertEquals(0, executions.size());
		}
		catch (Exception e) {
			e.printStackTrace();
			fail(e.getMessage());
		}
	}

	/**
	 * Verifies that {@link ExecutionGraph#updateState(TaskExecutionState)} updates the
	 * accumulators and metrics for an execution that failed or was canceled.
	 */
	@Test
	public void testAccumulatorsAndMetricsForwarding() throws Exception {
		final JobVertexID jid1 = new JobVertexID();
		final JobVertexID jid2 = new JobVertexID();

		JobVertex v1 = new JobVertex("v1", jid1);
		JobVertex v2 = new JobVertex("v2", jid2);

		Tuple2<ExecutionGraph, Map<ExecutionAttemptID, Execution>> graphAndExecutions = setupExecution(v1, 1, v2, 1);
		ExecutionGraph graph = graphAndExecutions.f0;

		// verify behavior for canceled executions
		Execution execution1 = graphAndExecutions.f1.values().iterator().next();

		IOMetrics ioMetrics = new IOMetrics(0, 0, 0, 0, 0, 0.0, 0.0, 0.0, 0.0, 0.0);
		Map<String, Accumulator<?, ?>> accumulators = new HashMap<>();
		accumulators.put("acc", new IntCounter(4));
		AccumulatorSnapshot accumulatorSnapshot =
			new AccumulatorSnapshot(graph.getJobID(), execution1.getAttemptId(), accumulators);

		TaskExecutionState state = new TaskExecutionState(
			graph.getJobID(), execution1.getAttemptId(), ExecutionState.CANCELED, null, accumulatorSnapshot, ioMetrics);
		graph.updateState(state);

		assertEquals(ioMetrics, execution1.getIOMetrics());
		assertNotNull(execution1.getUserAccumulators());
		assertEquals(4, execution1.getUserAccumulators().get("acc").getLocalValue());
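
		// a terminal-state update deregisters the execution from the graph's registry
		// (see the registration tests above), so the iterator below yields the
		// remaining (second) execution rather than the one just canceled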

		// verify behavior for failed executions
		Execution execution2 = graphAndExecutions.f1.values().iterator().next();

		IOMetrics ioMetrics2 = new IOMetrics(0, 0, 0, 0, 0, 0.0, 0.0, 0.0, 0.0, 0.0);
		Map<String, Accumulator<?, ?>> accumulators2 = new HashMap<>();
		accumulators2.put("acc", new IntCounter(8));
		AccumulatorSnapshot accumulatorSnapshot2 =
			new AccumulatorSnapshot(graph.getJobID(), execution2.getAttemptId(), accumulators2);

		TaskExecutionState state2 = new TaskExecutionState(
			graph.getJobID(), execution2.getAttemptId(), ExecutionState.FAILED, null, accumulatorSnapshot2, ioMetrics2);
		graph.updateState(state2);

		assertEquals(ioMetrics2, execution2.getIOMetrics());
		assertNotNull(execution2.getUserAccumulators());
		assertEquals(8, execution2.getUserAccumulators().get("acc").getLocalValue());
	}

	/**
	 * Verifies that {@link Execution#cancelingComplete(Map, IOMetrics)} and
	 * {@link Execution#markFailed(Throwable, Map, IOMetrics)} store the given
	 * accumulators and metrics correctly.
	 */
	@Test
	public void testAccumulatorsAndMetricsStorage() throws Exception {
		final JobVertexID jid1 = new JobVertexID();
		final JobVertexID jid2 = new JobVertexID();

		JobVertex v1 = new JobVertex("v1", jid1);
		JobVertex v2 = new JobVertex("v2", jid2);

		Map<ExecutionAttemptID, Execution> executions = setupExecution(v1, 1, v2, 1).f1;

		IOMetrics ioMetrics = new IOMetrics(0, 0, 0, 0, 0, 0.0, 0.0, 0.0, 0.0, 0.0);
		Map<String, Accumulator<?, ?>> accumulators = Collections.emptyMap();

		Execution execution1 = executions.values().iterator().next();
		execution1.cancel();
		execution1.cancelingComplete(accumulators, ioMetrics);

		assertEquals(ioMetrics, execution1.getIOMetrics());
		assertEquals(accumulators, execution1.getUserAccumulators());

		Execution execution2 = executions.values().iterator().next();
		execution2.markFailed(new Throwable(), accumulators, ioMetrics);

		assertEquals(ioMetrics, execution2.getIOMetrics());
		assertEquals(accumulators, execution2.getUserAccumulators());
	}

	@Test
	public void testRegistrationOfExecutionsCanceled() {
		try {
			final JobVertexID jid1 = new JobVertexID();
			final JobVertexID jid2 = new JobVertexID();

			JobVertex v1 = new JobVertex("v1", jid1);
			JobVertex v2 = new JobVertex("v2", jid2);

			Map<ExecutionAttemptID, Execution> executions = setupExecution(v1, 19, v2, 37).f1;

			for (Execution e : executions.values()) {
				e.cancel();
				e.cancelingComplete();
			}

			assertEquals(0, executions.size());
		}
		catch (Exception e) {
			e.printStackTrace();
			fail(e.getMessage());
		}
	}

	/**
	 * Tests that a blocking batch job fails if there are not enough resources left to schedule the
	 * succeeding tasks. This test case is related to [FLINK-4296] where finished producing tasks
	 * swallow the fail exception when scheduling a consumer task.
	 */
	@Test
	public void testNoResourceAvailableFailure() throws Exception {
		final JobID jobId = new JobID();
		JobVertex v1 = new JobVertex("source");
		JobVertex v2 = new JobVertex("sink");

		int dop1 = 1;
		int dop2 = 1;

		v1.setParallelism(dop1);
		v2.setParallelism(dop2);

		v1.setInvokableClass(BatchTask.class);
		v2.setInvokableClass(BatchTask.class);

		v2.connectNewDataSetAsInput(v1, DistributionPattern.POINTWISE, ResultPartitionType.BLOCKING);

		Scheduler scheduler = new Scheduler(TestingUtils.directExecutionContext());
		for (int i = 0; i < dop1; i++) {
			scheduler.newInstanceAvailable(
				ExecutionGraphTestUtils.getInstance(
					new ActorTaskManagerGateway(
						new ExecutionGraphTestUtils.SimpleActorGateway(
							TestingUtils.directExecutionContext()))));
		}

		// execution graph that executes actions synchronously
		ExecutionGraph eg = new ExecutionGraph(
			new DirectScheduledExecutorService(),
			TestingUtils.defaultExecutor(),
			jobId,
			"failing test job",
			new Configuration(),
			new SerializedValue<>(new ExecutionConfig()),
			AkkaUtils.getDefaultTimeout(),
			new NoRestartStrategy(),
			scheduler);

		eg.setQueuedSchedulingAllowed(false);

		List<JobVertex> ordered = Arrays.asList(v1, v2);
		eg.attachJobGraph(ordered);

		assertEquals(dop1, scheduler.getNumberOfAvailableSlots());

		// schedule, this triggers mock deployment
		eg.scheduleForExecution();

		ExecutionAttemptID attemptID =
			eg.getJobVertex(v1.getID()).getTaskVertices()[0].getCurrentExecutionAttempt().getAttemptId();
		eg.updateState(new TaskExecutionState(jobId, attemptID, ExecutionState.RUNNING));
		eg.updateState(new TaskExecutionState(jobId, attemptID, ExecutionState.FINISHED, null));

		assertEquals(JobStatus.FAILED, eg.getState());
	}

	// ------------------------------------------------------------------------
	//  retained checkpoints config test
	// ------------------------------------------------------------------------
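
	// The tests below build a minimal checkpointed job (see createExecutionGraph)
	// and verify that the completed-checkpoint store picks up
	// CoreOptions.MAX_RETAINED_CHECKPOINTS from the JobManager configuration,
	// falling back to the default for illegal values.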

	@Test
	public void testSettingDefaultMaxNumberOfCheckpointsToRetain() throws Exception {
		final Configuration jobManagerConfig = new Configuration();

		final ExecutionGraph eg = createExecutionGraph(jobManagerConfig);

		assertEquals(
			CoreOptions.MAX_RETAINED_CHECKPOINTS.defaultValue().intValue(),
			eg.getCheckpointCoordinator().getCheckpointStore().getMaxNumberOfRetainedCheckpoints());
	}

	@Test
	public void testSettingMaxNumberOfCheckpointsToRetain() throws Exception {
		final int maxNumberOfCheckpointsToRetain = 10;
		final Configuration jobManagerConfig = new Configuration();
		jobManagerConfig.setInteger(CoreOptions.MAX_RETAINED_CHECKPOINTS, maxNumberOfCheckpointsToRetain);

		final ExecutionGraph eg = createExecutionGraph(jobManagerConfig);

		assertEquals(
			maxNumberOfCheckpointsToRetain,
			eg.getCheckpointCoordinator().getCheckpointStore().getMaxNumberOfRetainedCheckpoints());
	}

	private Tuple2<ExecutionGraph, Map<ExecutionAttemptID, Execution>> setupExecution(
			JobVertex v1, int dop1, JobVertex v2, int dop2) throws Exception {
		final JobID jobId = new JobID();

		v1.setParallelism(dop1);
		v2.setParallelism(dop2);

		v1.setInvokableClass(BatchTask.class);
		v2.setInvokableClass(BatchTask.class);

		Scheduler scheduler = new Scheduler(TestingUtils.defaultExecutionContext());
		for (int i = 0; i < dop1 + dop2; i++) {
			scheduler.newInstanceAvailable(
				ExecutionGraphTestUtils.getInstance(
					new ActorTaskManagerGateway(
						new ExecutionGraphTestUtils.SimpleActorGateway(
							TestingUtils.directExecutionContext()))));
		}

		// execution graph that executes actions synchronously
		ExecutionGraph eg = new ExecutionGraph(
			new DirectScheduledExecutorService(),
			TestingUtils.defaultExecutor(),
			jobId,
			"some job",
			new Configuration(),
			new SerializedValue<>(new ExecutionConfig()),
			AkkaUtils.getDefaultTimeout(),
			new NoRestartStrategy(),
			scheduler);

		eg.setQueuedSchedulingAllowed(false);

		List<JobVertex> ordered = Arrays.asList(v1, v2);
		eg.attachJobGraph(ordered);

		assertEquals(dop1 + dop2, scheduler.getNumberOfAvailableSlots());

		// schedule, this triggers mock deployment
		eg.scheduleForExecution();

		Map<ExecutionAttemptID, Execution> executions = eg.getRegisteredExecutions();
		assertEquals(dop1 + dop2, executions.size());

		return new Tuple2<>(eg, executions);
	}

	@Test
	public void testSettingIllegalMaxNumberOfCheckpointsToRetain() throws Exception {
		final int negativeMaxNumberOfCheckpointsToRetain = -10;

		final Configuration jobManagerConfig = new Configuration();
		jobManagerConfig.setInteger(CoreOptions.MAX_RETAINED_CHECKPOINTS, negativeMaxNumberOfCheckpointsToRetain);

		final ExecutionGraph eg = createExecutionGraph(jobManagerConfig);

		assertNotEquals(
			negativeMaxNumberOfCheckpointsToRetain,
			eg.getCheckpointCoordinator().getCheckpointStore().getMaxNumberOfRetainedCheckpoints());
		assertEquals(
			CoreOptions.MAX_RETAINED_CHECKPOINTS.defaultValue().intValue(),
			eg.getCheckpointCoordinator().getCheckpointStore().getMaxNumberOfRetainedCheckpoints());
	}

	@SuppressWarnings("serial")
	public static class FailingFinalizeJobVertex extends JobVertex {

		public FailingFinalizeJobVertex(String name, JobVertexID id) {
			super(name, id);
		}

		@Override
		public void finalizeOnMaster(ClassLoader cl) throws Exception {
			throw new Exception();
		}
	}

	private ExecutionGraph createExecutionGraph(Configuration configuration) throws Exception {
		final ScheduledExecutorService executor = TestingUtils.defaultExecutor();

		final JobID jobId = new JobID();
		final JobGraph jobGraph = new JobGraph(jobId, "test");
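
		// enable checkpointing so that the built graph has a CheckpointCoordinator;
		// the positional arguments configure a 100 ms trigger interval, 10 min timeout,
		// no minimum pause, at most one concurrent checkpoint, and no externalized checkpoints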
		jobGraph.setSnapshotSettings(
			new JobCheckpointingSettings(
				Collections.<JobVertexID>emptyList(),
				Collections.<JobVertexID>emptyList(),
				Collections.<JobVertexID>emptyList(),
				100,
				10 * 60 * 1000,
				0,
				1,
				ExternalizedCheckpointSettings.none(),
				null,
				false));

		return ExecutionGraphBuilder.buildGraph(
			null,
			jobGraph,
			configuration,
			executor,
			executor,
			new ProgrammedSlotProvider(1),
			getClass().getClassLoader(),
			new StandaloneCheckpointRecoveryFactory(),
			Time.seconds(10),
			new NoRestartStrategy(),
			new UnregisteredMetricsGroup(),
			1,
			LoggerFactory.getLogger(getClass()));
	}
}