/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.yarn;
import akka.actor.ActorRef;
import akka.actor.Props;
import akka.pattern.Patterns;
import akka.util.Timeout;
import org.apache.flink.api.common.JobID;
import org.apache.flink.api.common.JobSubmissionResult;
import org.apache.flink.client.program.ClusterClient;
import org.apache.flink.client.program.ProgramInvocationException;
import org.apache.flink.runtime.akka.AkkaUtils;
import org.apache.flink.runtime.clusterframework.ApplicationStatus;
import org.apache.flink.runtime.clusterframework.messages.GetClusterStatus;
import org.apache.flink.runtime.clusterframework.messages.GetClusterStatusResponse;
import org.apache.flink.runtime.clusterframework.messages.InfoMessage;
import org.apache.flink.runtime.clusterframework.messages.ShutdownClusterAfterJob;
import org.apache.flink.runtime.highavailability.HighAvailabilityServices;
import org.apache.flink.runtime.jobgraph.JobGraph;
import org.apache.flink.util.Preconditions;
import org.apache.flink.yarn.cli.FlinkYarnSessionCli;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.service.Service;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ApplicationReport;
import org.apache.hadoop.yarn.api.records.YarnApplicationState;
import org.apache.hadoop.yarn.client.api.YarnClient;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Option;
import scala.concurrent.Await;
import scala.concurrent.Future;
import scala.concurrent.duration.FiniteDuration;
import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.atomic.AtomicBoolean;
/**
* Java representation of a running Flink cluster within YARN.
*/
public class YarnClusterClient extends ClusterClient {
private static final Logger LOG = LoggerFactory.getLogger(YarnClusterClient.class);
private static final int POLLING_THREAD_INTERVAL_MS = 1000;
private YarnClient yarnClient;
private Thread clientShutdownHook = new ClientShutdownHook();
private PollingThread pollingRunner;
private final Configuration hadoopConfig;
// (HDFS) location of the files required to run on YARN. Needed here to delete them on shutdown.
private final Path sessionFilesDir;
//---------- Class internal fields -------------------
private final AbstractYarnClusterDescriptor clusterDescriptor;
private final LazApplicationClientLoader applicationClient;
private final FiniteDuration akkaDuration;
private final ApplicationReport appReport;
private final ApplicationId appId;
private final String trackingURL;
private boolean isConnected = true;
/** Indicator whether this cluster has just been created */
private final boolean newlyCreatedCluster;
/**
* Create a new Flink on YARN cluster.
*
* @param clusterDescriptor The descriptor used at cluster creation
* @param yarnClient Client to talk to YARN
* @param appReport the YARN application ID
* @param flinkConfig Flink configuration
* @param sessionFilesDir Location of files required for YARN session
* @param newlyCreatedCluster Indicator whether this cluster has just been created
* @throws IOException
* @throws YarnException
*/
public YarnClusterClient(
final AbstractYarnClusterDescriptor clusterDescriptor,
final YarnClient yarnClient,
final ApplicationReport appReport,
org.apache.flink.configuration.Configuration flinkConfig,
Path sessionFilesDir,
boolean newlyCreatedCluster) throws Exception {
super(flinkConfig);
this.akkaDuration = AkkaUtils.getTimeout(flinkConfig);
this.clusterDescriptor = clusterDescriptor;
this.yarnClient = yarnClient;
this.hadoopConfig = yarnClient.getConfig();
this.sessionFilesDir = sessionFilesDir;
this.appReport = appReport;
this.appId = appReport.getApplicationId();
this.trackingURL = appReport.getTrackingUrl();
this.newlyCreatedCluster = newlyCreatedCluster;
this.applicationClient = new LazApplicationClientLoader(
flinkConfig,
actorSystemLoader,
highAvailabilityServices);
this.pollingRunner = new PollingThread(yarnClient, appId);
this.pollingRunner.setDaemon(true);
this.pollingRunner.start();
Runtime.getRuntime().addShutdownHook(clientShutdownHook);
}
/**
* Disconnect from the Yarn cluster
*/
public void disconnect() {
if (hasBeenShutDown.getAndSet(true)) {
return;
}
if(!isConnected) {
throw new IllegalStateException("Can not disconnect from an unconnected cluster.");
}
LOG.info("Disconnecting YarnClusterClient from ApplicationMaster");
try {
Runtime.getRuntime().removeShutdownHook(clientShutdownHook);
} catch (IllegalStateException e) {
// we are already in the shutdown hook
}
try {
pollingRunner.stopRunner();
pollingRunner.join(1000);
} catch(InterruptedException e) {
LOG.warn("Shutdown of the polling runner was interrupted", e);
Thread.currentThread().interrupt();
}
isConnected = false;
}
// -------------------------- Interaction with the cluster ------------------------
/*
* Tells the Cluster to monitor the status of JobId and stop itself once the specified job has finished.
*/
private void stopAfterJob(JobID jobID) {
Preconditions.checkNotNull(jobID, "The job id must not be null");
try {
Future<Object> replyFuture =
getJobManagerGateway().ask(
new ShutdownClusterAfterJob(jobID),
akkaDuration);
Await.ready(replyFuture, akkaDuration);
} catch (Exception e) {
throw new RuntimeException("Unable to tell application master to stop once the specified job has been finised", e);
}
}
@Override
public org.apache.flink.configuration.Configuration getFlinkConfiguration() {
return flinkConfig;
}
@Override
public int getMaxSlots() {
int maxSlots = clusterDescriptor.getTaskManagerCount() * clusterDescriptor.getTaskManagerSlots();
return maxSlots > 0 ? maxSlots : -1;
}
@Override
public boolean hasUserJarsInClassPath(List<URL> userJarFiles) {
return clusterDescriptor.hasUserJarFiles(userJarFiles);
}
@Override
protected JobSubmissionResult submitJob(JobGraph jobGraph, ClassLoader classLoader) throws ProgramInvocationException {
if (isDetached()) {
if (newlyCreatedCluster) {
stopAfterJob(jobGraph.getJobID());
}
return super.runDetached(jobGraph, classLoader);
} else {
return super.run(jobGraph, classLoader);
}
}
@Override
public String getWebInterfaceURL() {
// there seems to be a difference between HD 2.2.0 and 2.6.0
if(!trackingURL.startsWith("http://")) {
return "http://" + trackingURL;
} else {
return trackingURL;
}
}
@Override
public String getClusterIdentifier() {
return "Yarn cluster with application id " + appReport.getApplicationId();
}
/**
* This method is only available if the cluster hasn't been started in detached mode.
*/
@Override
public GetClusterStatusResponse getClusterStatus() {
if(!isConnected) {
throw new IllegalStateException("The cluster is not connected to the cluster.");
}
if(hasBeenShutdown()) {
throw new IllegalStateException("The cluster has already been shutdown.");
}
try {
final Future<Object> clusterStatusOption =
getJobManagerGateway().ask(
GetClusterStatus.getInstance(),
akkaDuration);
return (GetClusterStatusResponse) Await.result(clusterStatusOption, akkaDuration);
} catch (Exception e) {
throw new RuntimeException("Unable to get ClusterClient status from Application Client", e);
}
}
public ApplicationStatus getApplicationStatus() {
if(!isConnected) {
throw new IllegalStateException("The cluster has been connected to the ApplicationMaster.");
}
ApplicationReport lastReport = null;
if(pollingRunner == null) {
LOG.warn("YarnClusterClient.getApplicationStatus() has been called on an uninitialized cluster." +
"The system might be in an erroneous state");
} else {
lastReport = pollingRunner.getLastReport();
}
if(lastReport == null) {
LOG.warn("YarnClusterClient.getApplicationStatus() has been called on a cluster that didn't receive a status so far." +
"The system might be in an erroneous state");
return ApplicationStatus.UNKNOWN;
} else {
YarnApplicationState appState = lastReport.getYarnApplicationState();
ApplicationStatus status =
(appState == YarnApplicationState.FAILED || appState == YarnApplicationState.KILLED) ?
ApplicationStatus.FAILED : ApplicationStatus.SUCCEEDED;
if(status != ApplicationStatus.SUCCEEDED) {
LOG.warn("YARN reported application state {}", appState);
LOG.warn("Diagnostics: {}", lastReport.getDiagnostics());
}
return status;
}
}
@Override
public List<String> getNewMessages() {
if(hasBeenShutdown()) {
throw new RuntimeException("The YarnClusterClient has already been stopped");
}
if(!isConnected) {
throw new IllegalStateException("The cluster has been connected to the ApplicationMaster.");
}
List<String> ret = new ArrayList<String>();
// get messages from ApplicationClient (locally)
while(true) {
Object result;
try {
Future<Object> response =
Patterns.ask(
applicationClient.get(),
YarnMessages.getLocalGetYarnMessage(),
new Timeout(akkaDuration));
result = Await.result(response, akkaDuration);
} catch(Exception ioe) {
LOG.warn("Error retrieving the YARN messages locally", ioe);
break;
}
if(!(result instanceof Option)) {
throw new RuntimeException("LocalGetYarnMessage requires a response of type " +
"Option. Instead the response is of type " + result.getClass() + ".");
} else {
Option messageOption = (Option) result;
LOG.debug("Received message option {}", messageOption);
if(messageOption.isEmpty()) {
break;
} else {
Object obj = messageOption.get();
if(obj instanceof InfoMessage) {
InfoMessage msg = (InfoMessage) obj;
ret.add("[" + msg.date() + "] " + msg.message());
} else {
LOG.warn("LocalGetYarnMessage returned unexpected type: " + messageOption);
}
}
}
}
return ret;
}
// -------------------------- Shutdown handling ------------------------
private AtomicBoolean hasBeenShutDown = new AtomicBoolean(false);
/**
* Shuts down or disconnects from the YARN cluster.
*/
@Override
public void finalizeCluster() {
if (isDetached() || !newlyCreatedCluster) {
disconnect();
} else {
shutdownCluster();
}
}
/**
* Shuts down the Yarn application
*/
public void shutdownCluster() {
if (hasBeenShutDown.getAndSet(true)) {
return;
}
if (!isConnected) {
throw new IllegalStateException("The cluster has been not been connected to the ApplicationMaster.");
}
try {
Runtime.getRuntime().removeShutdownHook(clientShutdownHook);
} catch (IllegalStateException e) {
// we are already in the shutdown hook
}
LOG.info("Sending shutdown request to the Application Master");
try {
Future<Object> response =
Patterns.ask(applicationClient.get(),
new YarnMessages.LocalStopYarnSession(getApplicationStatus(),
"Flink YARN Client requested shutdown"),
new Timeout(akkaDuration));
Await.ready(response, akkaDuration);
} catch(Exception e) {
LOG.warn("Error while stopping YARN cluster.", e);
}
try {
File propertiesFile = FlinkYarnSessionCli.getYarnPropertiesLocation(flinkConfig);
if (propertiesFile.isFile()) {
if (propertiesFile.delete()) {
LOG.info("Deleted Yarn properties file at {}", propertiesFile.getAbsoluteFile().toString());
} else {
LOG.warn("Couldn't delete Yarn properties file at {}", propertiesFile.getAbsoluteFile().toString());
}
}
} catch (Exception e) {
LOG.warn("Exception while deleting the JobManager address file", e);
}
if (sessionFilesDir != null) {
LOG.info("Deleting files in " + sessionFilesDir);
try {
FileSystem shutFS = FileSystem.get(hadoopConfig);
shutFS.delete(sessionFilesDir, true); // delete conf and jar file.
shutFS.close();
} catch (IOException e) {
LOG.error("Could not delete the Flink jar and configuration files in HDFS..", e);
}
} else {
LOG.warn("Session file directory not set. Not deleting session files");
}
try {
pollingRunner.stopRunner();
pollingRunner.join(1000);
} catch(InterruptedException e) {
LOG.warn("Shutdown of the polling runner was interrupted", e);
Thread.currentThread().interrupt();
}
try {
ApplicationReport appReport = yarnClient.getApplicationReport(appId);
LOG.info("Application " + appId + " finished with state " + appReport
.getYarnApplicationState() + " and final state " + appReport
.getFinalApplicationStatus() + " at " + appReport.getFinishTime());
if (appReport.getYarnApplicationState() == YarnApplicationState.FAILED || appReport.getYarnApplicationState()
== YarnApplicationState.KILLED) {
LOG.warn("Application failed. Diagnostics " + appReport.getDiagnostics());
LOG.warn("If log aggregation is activated in the Hadoop cluster, we recommend to retrieve "
+ "the full application log using this command:"
+ System.lineSeparator()
+ "\tyarn logs -applicationId " + appReport.getApplicationId()
+ System.lineSeparator()
+ "(It sometimes takes a few seconds until the logs are aggregated)");
}
} catch (Exception e) {
LOG.warn("Couldn't get final report", e);
}
LOG.info("YARN Client is shutting down");
yarnClient.stop(); // actorRunner is using the yarnClient.
yarnClient = null; // set null to clearly see if somebody wants to access it afterwards.
}
public boolean hasBeenShutdown() {
return hasBeenShutDown.get();
}
private class ClientShutdownHook extends Thread {
@Override
public void run() {
LOG.info("Shutting down YarnClusterClient from the client shutdown hook");
try {
shutdown();
} catch (Throwable t) {
LOG.warn("Could not properly shut down the yarn cluster client.", t);
}
}
}
// -------------------------- Polling ------------------------
private static class PollingThread extends Thread {
AtomicBoolean running = new AtomicBoolean(true);
private YarnClient yarnClient;
private ApplicationId appId;
// ------- status information stored in the polling thread
private final Object lock = new Object();
private ApplicationReport lastReport;
public PollingThread(YarnClient yarnClient, ApplicationId appId) {
this.yarnClient = yarnClient;
this.appId = appId;
}
public void stopRunner() {
if(!running.get()) {
LOG.warn("Polling thread was already stopped");
}
running.set(false);
}
public ApplicationReport getLastReport() {
synchronized (lock) {
return lastReport;
}
}
@Override
public void run() {
while (running.get() && yarnClient.isInState(Service.STATE.STARTED)) {
try {
ApplicationReport report = yarnClient.getApplicationReport(appId);
synchronized (lock) {
lastReport = report;
}
} catch (Exception e) {
LOG.warn("Error while getting application report", e);
}
try {
Thread.sleep(YarnClusterClient.POLLING_THREAD_INTERVAL_MS);
} catch (InterruptedException e) {
LOG.error("Polling thread got interrupted", e);
Thread.currentThread().interrupt(); // pass interrupt.
stopRunner();
}
}
if(running.get() && !yarnClient.isInState(Service.STATE.STARTED)) {
// == if the polling thread is still running but the yarn client is stopped.
LOG.warn("YARN client is unexpected in state " + yarnClient.getServiceState());
}
}
}
@Override
public boolean isDetached() {
return super.isDetached() || clusterDescriptor.isDetachedMode();
}
/**
* Blocks until all TaskManagers are connected to the JobManager.
*/
@Override
public void waitForClusterToBeReady() {
logAndSysout("Waiting until all TaskManagers have connected");
for (GetClusterStatusResponse currentStatus, lastStatus = null;; lastStatus = currentStatus) {
currentStatus = getClusterStatus();
if (currentStatus != null && !currentStatus.equals(lastStatus)) {
logAndSysout("TaskManager status (" + currentStatus.numRegisteredTaskManagers() + "/"
+ clusterDescriptor.getTaskManagerCount() + ")");
if (currentStatus.numRegisteredTaskManagers() >= clusterDescriptor.getTaskManagerCount()) {
logAndSysout("All TaskManagers are connected");
break;
}
} else if (lastStatus == null) {
logAndSysout("No status updates from the YARN cluster received so far. Waiting ...");
}
try {
Thread.sleep(250);
} catch (InterruptedException e) {
throw new RuntimeException("Interrupted while waiting for TaskManagers", e);
}
}
}
public ApplicationId getApplicationId() {
return appId;
}
private static class LazApplicationClientLoader {
private final org.apache.flink.configuration.Configuration flinkConfig;
private final LazyActorSystemLoader actorSystemLoader;
private final HighAvailabilityServices highAvailabilityServices;
private ActorRef applicationClient;
private LazApplicationClientLoader(
org.apache.flink.configuration.Configuration flinkConfig,
LazyActorSystemLoader actorSystemLoader,
HighAvailabilityServices highAvailabilityServices) {
this.flinkConfig = Preconditions.checkNotNull(flinkConfig, "flinkConfig");
this.actorSystemLoader = Preconditions.checkNotNull(actorSystemLoader, "actorSystemLoader");
this.highAvailabilityServices = Preconditions.checkNotNull(highAvailabilityServices, "highAvailabilityServices");
}
/**
* Creates a new ApplicationClient actor or returns an existing one. May start an ActorSystem.
* @return ActorSystem
*/
public ActorRef get() {
if (applicationClient == null) {
// start application client
LOG.info("Start application client.");
applicationClient = actorSystemLoader.get().actorOf(
Props.create(
ApplicationClient.class,
flinkConfig,
highAvailabilityServices.getJobManagerLeaderRetriever(HighAvailabilityServices.DEFAULT_JOB_ID)),
"applicationClient");
}
return applicationClient;
}
}
}