FailoverRegion.java example

Explorer
flink-master
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.runtime.executiongraph.failover;

import org.apache.flink.runtime.concurrent.AcceptFunction;
import org.apache.flink.runtime.concurrent.Future;
import org.apache.flink.runtime.concurrent.FutureUtils;
import org.apache.flink.runtime.executiongraph.Execution;
import org.apache.flink.runtime.executiongraph.ExecutionGraph;
import org.apache.flink.runtime.executiongraph.ExecutionVertex;
import org.apache.flink.runtime.executiongraph.GlobalModVersionMismatch;
import org.apache.flink.runtime.jobgraph.JobStatus;
import org.apache.flink.runtime.jobmanager.scheduler.CoLocationGroup;
import org.apache.flink.util.AbstractID;
import org.apache.flink.util.FlinkException;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.concurrent.Executor;
import java.util.concurrent.atomic.AtomicReferenceFieldUpdater;

import static org.apache.flink.util.Preconditions.checkNotNull;

/**
 * FailoverRegion manages the failover of a minimal pipeline connected sub graph.
 * It will change from CREATED to CANCELING and then to CANCELLED and at last to RUNNING,
 */
public class FailoverRegion {

	private static final AtomicReferenceFieldUpdater<FailoverRegion, JobStatus> STATE_UPDATER =
			AtomicReferenceFieldUpdater.newUpdater(FailoverRegion.class, JobStatus.class, "state");

	/** The log object used for debugging. */
	private static final Logger LOG = LoggerFactory.getLogger(FailoverRegion.class);

	// ------------------------------------------------------------------------

	/** a unique id for debugging */
	private final AbstractID id = new AbstractID();

	private final ExecutionGraph executionGraph;

	private final List<ExecutionVertex> connectedExecutionVertexes;

	/** The executor that executes the recovery action after all vertices are in a */
	private final Executor executor;

	/** Current status of the job execution */
	private volatile JobStatus state = JobStatus.RUNNING;


	public FailoverRegion(ExecutionGraph executionGraph, Executor executor, List<ExecutionVertex> connectedExecutions) {
		this.executionGraph = checkNotNull(executionGraph);
		this.executor = checkNotNull(executor);
		this.connectedExecutionVertexes = checkNotNull(connectedExecutions);

		LOG.debug("Created failover region {} with vertices: {}", id, connectedExecutions);
	}

	public void onExecutionFail(Execution taskExecution, Throwable cause) {
		// TODO: check if need to failover the preceding region
		if (!executionGraph.getRestartStrategy().canRestart()) {
			// delegate the failure to a global fail that will check the restart strategy and not restart
			executionGraph.failGlobal(cause);
		}
		else {
			cancel(taskExecution.getGlobalModVersion());
		}
	}

	private void allVerticesInTerminalState(long globalModVersionOfFailover) {
		while (true) {
			JobStatus curStatus = this.state;
			if (curStatus.equals(JobStatus.CANCELLING)) {
				if (transitionState(curStatus, JobStatus.CANCELED)) {
					reset(globalModVersionOfFailover);
					break;
				}
			}
			else {
				LOG.info("FailoverRegion {} is {} when allVerticesInTerminalState.", id, state);
				break;
			}
		}
	}

	public JobStatus getState() {
		return state;
	}

	/**
	 * get all execution vertexes contained in this region
	 */
	public List<ExecutionVertex> getAllExecutionVertexes() {
		return connectedExecutionVertexes;
	}

	// Notice the region to failover, 
	private void failover(long globalModVersionOfFailover) {
		if (!executionGraph.getRestartStrategy().canRestart()) {
			executionGraph.failGlobal(new FlinkException("RestartStrategy validate fail"));
		}
		else {
			JobStatus curStatus = this.state;
			if (curStatus.equals(JobStatus.RUNNING)) {
				cancel(globalModVersionOfFailover);
			}
			else if (curStatus.equals(JobStatus.CANCELED)) {
				reset(globalModVersionOfFailover);
			}
			else {
				LOG.info("FailoverRegion {} is {} when notified to failover.", id, state);
			}
		}
	}

	// cancel all executions in this sub graph
	private void cancel(final long globalModVersionOfFailover) {
		while (true) {
			JobStatus curStatus = this.state;
			if (curStatus.equals(JobStatus.RUNNING)) {
				if (transitionState(curStatus, JobStatus.CANCELLING)) {

					// we build a future that is complete once all vertices have reached a terminal state
					final ArrayList<Future<?>> futures = new ArrayList<>(connectedExecutionVertexes.size());

					// cancel all tasks (that still need cancelling)
					for (ExecutionVertex vertex : connectedExecutionVertexes) {
						futures.add(vertex.cancel());
					}

					final FutureUtils.ConjunctFuture<Void> allTerminal = FutureUtils.waitForAll(futures);
					allTerminal.thenAcceptAsync(new AcceptFunction<Void>() {
						@Override
						public void accept(Void value) {
							allVerticesInTerminalState(globalModVersionOfFailover);
						}
					}, executor);

					break;
				}
			}
			else {
				LOG.info("FailoverRegion {} is {} when cancel.", id, state);
				break;
			}
		}
	}

	// reset all executions in this sub graph
	private void reset(long globalModVersionOfFailover) {
		try {
			// reset all connected ExecutionVertexes
			final Collection<CoLocationGroup> colGroups = new HashSet<>();
			final long restartTimestamp = System.currentTimeMillis();

			for (ExecutionVertex ev : connectedExecutionVertexes) {
				CoLocationGroup cgroup = ev.getJobVertex().getCoLocationGroup();
				if (cgroup != null && !colGroups.contains(cgroup)){
					cgroup.resetConstraints();
					colGroups.add(cgroup);
				}

				ev.resetForNewExecution(restartTimestamp, globalModVersionOfFailover);
			}
			if (transitionState(JobStatus.CANCELED, JobStatus.CREATED)) {
				restart(globalModVersionOfFailover);
			}
			else {
				LOG.info("FailoverRegion {} switched from CANCELLING to CREATED fail, will fail this region again.", id);
				failover(globalModVersionOfFailover);
			}
		}
		catch (GlobalModVersionMismatch e) {
			// happens when a global recovery happens concurrently to the regional recovery
			// go back to a clean state
			state = JobStatus.RUNNING;
		}
		catch (Throwable e) {
			LOG.info("FailoverRegion {} reset fail, will failover again.", id);
			failover(globalModVersionOfFailover);
		}
	}

	// restart all executions in this sub graph
	private void restart(long globalModVersionOfFailover) {
		try {
			if (transitionState(JobStatus.CREATED, JobStatus.RUNNING)) {
				// if we have checkpointed state, reload it into the executions
				//TODO: checkpoint support restore part ExecutionVertex cp
				/**
				if (executionGraph.getCheckpointCoordinator() != null) {
					executionGraph.getCheckpointCoordinator().restoreLatestCheckpointedState(
							connectedExecutionVertexes, false, false);
				}
				*/
				//TODO, use restart strategy to schedule them.
				//restart all connected ExecutionVertexes
				for (ExecutionVertex ev : connectedExecutionVertexes) {
					try {
						ev.scheduleForExecution(
								executionGraph.getSlotProvider(),
								executionGraph.isQueuedSchedulingAllowed());
					}
					catch (Throwable e) {
						failover(globalModVersionOfFailover);
					}
				}
			}
			else {
				LOG.info("FailoverRegion {} switched from CREATED to RUNNING fail, will fail this region again.", id);
				failover(globalModVersionOfFailover);
			}
		} catch (Exception e) {
			LOG.info("FailoverRegion {} restart failed, failover again.", id, e);
			failover(globalModVersionOfFailover);
		}
	}

	private boolean transitionState(JobStatus current, JobStatus newState) {
		if (STATE_UPDATER.compareAndSet(this, current, newState)) {
			LOG.info("FailoverRegion {} switched from state {} to {}.", id, current, newState);
			return true;
		}
		else {
			return false;
		}
	}

}