/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.yarn.highavailability;
import org.apache.flink.api.common.JobID;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.configuration.IllegalConfigurationException;
import org.apache.flink.runtime.highavailability.HighAvailabilityServices;
import org.apache.flink.runtime.highavailability.ServicesThreadFactory;
import org.apache.flink.runtime.highavailability.nonha.leaderelection.SingleLeaderElectionService;
import org.apache.flink.runtime.leaderelection.LeaderElectionService;
import org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService;
import java.io.IOException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
/**
* These YarnHighAvailabilityServices are for the Application Master in setups where there is one
* ResourceManager that is statically configured in the Flink configuration.
*
* <h3>Handled failure types</h3>
* <ul>
 * <li><b>User code &amp; operator failures:</b> Failed operators are recovered from checkpoints.</li>
* <li><b>Task Manager Failures:</b> Failed Task Managers are restarted and their tasks are
* recovered from checkpoints.</li>
* </ul>
*
* <h3>Non-recoverable failure types</h3>
* <ul>
* <li><b>Application Master failures:</b> These failures cannot be recovered, because TaskManagers
* have no way to discover the new Application Master's address.</li>
* </ul>
*
* <p>Internally, these services put their recovery data into YARN's working directory,
* except for checkpoints, which are in the configured checkpoint directory. That way,
* checkpoints can be resumed with a new job/application, even if the complete YARN application
* is killed and cleaned up.
*
 * <p>Because ResourceManager and JobManager both run in the same process (the Application Master),
 * they use an embedded leader election service to find each other.
*
* <p>A typical YARN setup that uses these HA services first starts the ResourceManager
* inside the ApplicationMaster and puts its RPC endpoint address into the configuration with which
* the TaskManagers are started. Because of this static addressing scheme, the setup cannot handle failures
* of the JobManager and ResourceManager, which are running as part of the Application Master.
*
* @see HighAvailabilityServices
*/
public class YarnIntraNonHaMasterServices extends AbstractYarnNonHaServices {

	/** The dispatcher thread pool for these services. */
	private final ExecutorService dispatcher;

	/** The embedded leader election service used by JobManagers to find the resource manager. */
	private final SingleLeaderElectionService resourceManagerLeaderElectionService;

	// ------------------------------------------------------------------------

	/**
	 * Creates new YarnIntraNonHaMasterServices for the given Flink and YARN configuration.
	 *
	 * <p>This constructor initializes access to the HDFS to store recovery data, and creates the
	 * embedded leader election services through which ResourceManager and JobManager find and
	 * confirm each other.
	 *
	 * @param config The Flink configuration of this component / process.
	 * @param hadoopConf The Hadoop configuration for the YARN cluster.
	 *
	 * @throws IOException
	 *             Thrown, if the initialization of the Hadoop file system used by YARN fails.
	 * @throws IllegalConfigurationException
	 *             Thrown, if the Flink configuration does not properly describe the ResourceManager address and port.
	 */
	public YarnIntraNonHaMasterServices(
			Configuration config,
			org.apache.hadoop.conf.Configuration hadoopConf) throws IOException {

		super(config, hadoopConf);

		// track whether we successfully perform the initialization
		boolean successful = false;

		try {
			this.dispatcher = Executors.newSingleThreadExecutor(new ServicesThreadFactory());
			this.resourceManagerLeaderElectionService =
					new SingleLeaderElectionService(dispatcher, DEFAULT_LEADER_ID);

			// all good!
			successful = true;
		}
		finally {
			if (!successful) {
				// quietly undo what the parent constructor initialized
				try {
					super.close();
				} catch (Throwable ignored) {
					// best-effort cleanup of a half-initialized service; the exception
					// propagating out of the try block above is the relevant one
				}
			}
		}
	}

	// ------------------------------------------------------------------------
	//  Services
	// ------------------------------------------------------------------------

	/**
	 * Gets a retriever for the leader of the embedded ResourceManager leader election service.
	 *
	 * @return A new leader retrieval service backed by the embedded election service.
	 */
	@Override
	public LeaderRetrievalService getResourceManagerLeaderRetriever() {
		enter();
		try {
			return resourceManagerLeaderElectionService.createLeaderRetrievalService();
		}
		finally {
			exit();
		}
	}

	/**
	 * Gets the embedded leader election service for the ResourceManager.
	 *
	 * @return The singleton embedded leader election service.
	 */
	@Override
	public LeaderElectionService getResourceManagerLeaderElectionService() {
		enter();
		try {
			return resourceManagerLeaderElectionService;
		}
		finally {
			exit();
		}
	}

	/**
	 * JobManager leader election is not supported by these services yet.
	 *
	 * @param jobID The job for which a leader election service is requested.
	 * @throws UnsupportedOperationException Always thrown; not implemented yet.
	 */
	@Override
	public LeaderElectionService getJobManagerLeaderElectionService(JobID jobID) {
		enter();
		try {
			throw new UnsupportedOperationException("needs refactoring to accept default address");
		}
		finally {
			exit();
		}
	}

	/**
	 * JobManager leader retrieval is not supported by these services yet.
	 *
	 * @param jobID The job for which a leader retrieval service is requested.
	 * @throws UnsupportedOperationException Always thrown; not implemented yet.
	 */
	@Override
	public LeaderRetrievalService getJobManagerLeaderRetriever(JobID jobID) {
		enter();
		try {
			throw new UnsupportedOperationException("needs refactoring to accept default address");
		}
		finally {
			exit();
		}
	}

	// ------------------------------------------------------------------------
	//  shutdown
	// ------------------------------------------------------------------------

	@Override
	public void close() throws Exception {
		if (enterUnlessClosed()) {
			try {
				try {
					// this class' own cleanup logic
					resourceManagerLeaderElectionService.shutdown();
				}
				finally {
					// stop the dispatcher threads even if the election service
					// shutdown above threw, and in any case run the parent
					// cleanup logic afterwards
					try {
						dispatcher.shutdownNow();
					}
					finally {
						super.close();
					}
				}
			}
			finally {
				exit();
			}
		}
	}
}