/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.runtime.client;
import akka.actor.ActorRef;
import akka.actor.ActorSystem;
import akka.actor.Address;
import akka.actor.Identify;
import akka.actor.PoisonPill;
import akka.actor.Props;
import akka.pattern.Patterns;
import akka.util.Timeout;
import org.apache.flink.api.common.JobExecutionResult;
import org.apache.flink.api.common.JobID;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.runtime.akka.AkkaUtils;
import org.apache.flink.runtime.akka.ListeningBehaviour;
import org.apache.flink.runtime.blob.BlobCache;
import org.apache.flink.runtime.blob.BlobKey;
import org.apache.flink.runtime.execution.librarycache.FlinkUserCodeClassLoader;
import org.apache.flink.runtime.highavailability.HighAvailabilityServices;
import org.apache.flink.runtime.instance.ActorGateway;
import org.apache.flink.runtime.jobgraph.JobGraph;
import org.apache.flink.runtime.messages.JobClientMessages;
import org.apache.flink.runtime.messages.JobManagerMessages;
import org.apache.flink.runtime.util.SerializedThrowable;
import org.apache.flink.util.NetUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Option;
import scala.Some;
import scala.Tuple2;
import scala.concurrent.Await;
import scala.concurrent.Future;
import scala.concurrent.duration.Duration;
import scala.concurrent.duration.FiniteDuration;
import java.io.IOException;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.URL;
import java.util.Collection;
import java.util.concurrent.TimeoutException;
import static org.apache.flink.util.Preconditions.checkNotNull;
/**
 * The JobClient bridges between the JobManager's asynchronous actor messages and
 * the synchronous method calls that trigger a job execution and wait for its result.
 */
public class JobClient {
private static final Logger LOG = LoggerFactory.getLogger(JobClient.class);
/**
 * Starts a local actor system with remoting enabled so the JobClient can
 * communicate with a (possibly remote) JobManager.
 *
 * @param config The Flink configuration used to create the actor system
 * @return the started actor system, bound to an arbitrary free port
 * @throws IOException if the actor system could not be started
 */
public static ActorSystem startJobClientActorSystem(Configuration config)
		throws IOException {
	LOG.info("Starting JobClient actor system");

	// empty host name and port 0 => remoting binds to an arbitrary free port
	Option<Tuple2<String, Object>> remoting = new Some<>(new Tuple2<String, Object>("", 0));

	// start a remote actor system to listen on an arbitrary port
	ActorSystem system = AkkaUtils.createActorSystem(config, remoting);
	Address address = system.provider().getDefaultAddress();

	String hostAddress = address.host().isDefined() ?
			NetUtils.ipAddressToUrlString(InetAddress.getByName(address.host().get())) :
			"(unknown)";
	int port = address.port().isDefined() ? ((Integer) address.port().get()) : -1;
	// parameterized logging avoids the string concatenation when INFO is disabled
	LOG.info("Started JobClient actor system at {}:{}", hostAddress, port);

	return system;
}
/**
 * Submits a job to a Flink cluster (non-blocking) and returns a JobListeningContext which can be
 * passed to {@code awaitJobResult} to get the result of the submission.
 *
 * @param actorSystem The actor system used to communicate with the JobManager
 * @param config The cluster wide configuration
 * @param highAvailabilityServices Service factory for high availability services
 * @param jobGraph JobGraph describing the Flink job
 * @param timeout Ask timeout used while monitoring the submission
 * @param sysoutLogUpdates prints log updates to system out if true
 * @param classLoader The class loader for deserializing the results
 * @return JobListeningContext which may be used to retrieve the JobExecutionResult via
 * {@code awaitJobResult(JobListeningContext context)}.
 */
public static JobListeningContext submitJob(
		ActorSystem actorSystem,
		Configuration config,
		HighAvailabilityServices highAvailabilityServices,
		JobGraph jobGraph,
		FiniteDuration timeout,
		boolean sysoutLogUpdates,
		ClassLoader classLoader) {

	checkNotNull(actorSystem, "The actorSystem must not be null.");
	// validate config as well: it is handed to the client actor below, and
	// attachToRunningJob already validates its configuration the same way
	checkNotNull(config, "The config must not be null.");
	checkNotNull(highAvailabilityServices, "The high availability services must not be null.");
	checkNotNull(jobGraph, "The jobGraph must not be null.");
	checkNotNull(timeout, "The timeout must not be null.");

	// for this job, we create a proxy JobClientActor that deals with all communication with
	// the JobManager. It forwards the job submission, checks the success/failure responses, logs
	// update messages, watches for disconnect between client and JobManager, ...
	Props jobClientActorProps = JobSubmissionClientActor.createActorProps(
		highAvailabilityServices.getJobManagerLeaderRetriever(HighAvailabilityServices.DEFAULT_JOB_ID),
		timeout,
		sysoutLogUpdates,
		config);

	ActorRef jobClientActor = actorSystem.actorOf(jobClientActorProps);

	// ask with an "infinite" timeout: the overall wait is bounded later by awaitJobResult
	Future<Object> submissionFuture = Patterns.ask(
		jobClientActor,
		new JobClientMessages.SubmitJobAndWait(jobGraph),
		new Timeout(AkkaUtils.INF_TIMEOUT()));

	return new JobListeningContext(
		jobGraph.getJobID(),
		submissionFuture,
		jobClientActor,
		timeout,
		classLoader,
		highAvailabilityServices);
}
/**
 * Attaches to a running Job using the JobID.
 * Reconstructs the user class loader by downloading the jars from the JobManager.
 */
public static JobListeningContext attachToRunningJob(
		JobID jobID,
		ActorGateway jobManagerGateWay,
		Configuration configuration,
		ActorSystem actorSystem,
		HighAvailabilityServices highAvailabilityServices,
		FiniteDuration timeout,
		boolean sysoutLogUpdates) {

	checkNotNull(jobID, "The jobID must not be null.");
	checkNotNull(jobManagerGateWay, "The jobManagerGateWay must not be null.");
	checkNotNull(configuration, "The configuration must not be null.");
	checkNotNull(actorSystem, "The actorSystem must not be null.");
	checkNotNull(highAvailabilityServices, "The high availability services must not be null.");
	checkNotNull(timeout, "The timeout must not be null.");

	// Spawn a proxy actor that owns all JobManager communication for this attachment:
	// it forwards the attach request, evaluates the success/failure responses, logs
	// status updates, and watches for a disconnect between client and JobManager.
	final Props clientActorProps = JobAttachmentClientActor.createActorProps(
		highAvailabilityServices.getJobManagerLeaderRetriever(HighAvailabilityServices.DEFAULT_JOB_ID),
		timeout,
		sysoutLogUpdates);

	final ActorRef clientActorRef = actorSystem.actorOf(clientActorProps);

	// unbounded ask timeout here; callers bound the overall wait via the returned context
	final Future<Object> attachFuture = Patterns.ask(
		clientActorRef,
		new JobClientMessages.AttachToJobAndWait(jobID),
		new Timeout(AkkaUtils.INF_TIMEOUT()));

	return new JobListeningContext(
		jobID,
		attachFuture,
		clientActorRef,
		timeout,
		actorSystem,
		configuration,
		highAvailabilityServices);
}
/**
 * Reconstructs the class loader by first requesting information about it at the JobManager
 * and then downloading missing jar files.
 *
 * @param jobID id of job
 * @param jobManager gateway to the JobManager
 * @param config the flink configuration
 * @param highAvailabilityServices high availability services used to create the BLOB store
 * @return A classloader that should behave like the original classloader
 * @throws JobRetrievalException if anything goes wrong
 */
public static ClassLoader retrieveClassLoader(
		JobID jobID,
		ActorGateway jobManager,
		Configuration config,
		HighAvailabilityServices highAvailabilityServices)
	throws JobRetrievalException {

	final Object jmAnswer;
	try {
		jmAnswer = Await.result(
			jobManager.ask(
				new JobManagerMessages.RequestClassloadingProps(jobID),
				AkkaUtils.getDefaultTimeoutAsFiniteDuration()),
			AkkaUtils.getDefaultTimeoutAsFiniteDuration());
	} catch (Exception e) {
		throw new JobRetrievalException(jobID, "Couldn't retrieve class loading properties from JobManager.", e);
	}

	if (jmAnswer instanceof JobManagerMessages.ClassloadingProps) {
		JobManagerMessages.ClassloadingProps props = ((JobManagerMessages.ClassloadingProps) jmAnswer);

		// the BLOB server is reached on the JobManager's host at the reported port
		Option<String> jmHost = jobManager.actor().path().address().host();
		String jmHostname = jmHost.isDefined() ? jmHost.get() : "localhost";
		InetSocketAddress serverAddress = new InetSocketAddress(jmHostname, props.blobManagerPort());

		final BlobCache blobClient;
		try {
			// TODO: Fix lifecycle of BlobCache to properly close it upon usage
			blobClient = new BlobCache(serverAddress, config, highAvailabilityServices.createBlobStore());
		} catch (IOException e) {
			throw new JobRetrievalException(jobID,
				"Failed to setup blob cache", e);
		}

		final Collection<BlobKey> requiredJarFiles = props.requiredJarFiles();
		final Collection<URL> requiredClasspaths = props.requiredClasspaths();

		final URL[] allURLs = new URL[requiredJarFiles.size() + requiredClasspaths.size()];

		int pos = 0;
		// iterate the locally captured snapshot (the original re-invoked
		// props.requiredJarFiles() here; use the same collection that sized the array)
		for (BlobKey blobKey : requiredJarFiles) {
			try {
				allURLs[pos++] = blobClient.getURL(blobKey);
			} catch (Exception e) {
				// close the cache before propagating, so its resources are not leaked
				try {
					blobClient.close();
				} catch (IOException ioe) {
					LOG.warn("Could not properly close the BlobClient.", ioe);
				}
				throw new JobRetrievalException(jobID, "Failed to download BlobKey " + blobKey, e);
			}
		}

		for (URL url : requiredClasspaths) {
			allURLs[pos++] = url;
		}

		return new FlinkUserCodeClassLoader(allURLs, JobClient.class.getClassLoader());
	} else if (jmAnswer instanceof JobManagerMessages.JobNotFound) {
		throw new JobRetrievalException(jobID, "Couldn't retrieve class loader. Job " + jobID + " not found");
	} else {
		throw new JobRetrievalException(jobID, "Unknown response from JobManager: " + jmAnswer);
	}
}
/**
 * Given a JobListeningContext, awaits the result of the job execution that this context is bound to.
 *
 * @param listeningContext The listening context of the job execution
 * @return The result of the execution
 * @throws JobExecutionException if anything goes wrong while monitoring the job
 */
public static JobExecutionResult awaitJobResult(JobListeningContext listeningContext) throws JobExecutionException {

	final JobID jobID = listeningContext.getJobID();
	final ActorRef jobClientActor = listeningContext.getJobClientActor();
	final Future<Object> jobSubmissionFuture = listeningContext.getJobResultFuture();
	final FiniteDuration askTimeout = listeningContext.getTimeout();
	// retrieves class loader if necessary
	final ClassLoader classLoader = listeningContext.getClassLoader();

	// wait for the future which holds the result to be ready
	// ping the JobClientActor from time to time to check if it is still running
	while (!jobSubmissionFuture.isCompleted()) {
		try {
			Await.ready(jobSubmissionFuture, askTimeout);
		} catch (InterruptedException e) {
			// restore the interrupt flag for callers up the stack and
			// preserve the original exception as the cause
			Thread.currentThread().interrupt();
			throw new JobExecutionException(
				jobID,
				"Interrupted while waiting for job completion.",
				e);
		} catch (TimeoutException e) {
			try {
				Await.result(
					Patterns.ask(
						jobClientActor,
						// Ping the Actor to see if it is alive
						new Identify(true),
						Timeout.durationToTimeout(askTimeout)),
					askTimeout);
				// we got a reply, continue waiting for the job result
			} catch (Exception eInner) {
				// we could have a result but the JobClientActor might have been killed and
				// thus the health check failed
				if (!jobSubmissionFuture.isCompleted()) {
					throw new JobExecutionException(
						jobID,
						"JobClientActor seems to have died before the JobExecutionResult could be retrieved.",
						eInner);
				}
			}
		}
	}

	final Object answer;
	try {
		// we have already awaited the result, zero time to wait here
		answer = Await.result(jobSubmissionFuture, Duration.Zero());
	}
	catch (Throwable throwable) {
		throw new JobExecutionException(jobID,
			"Couldn't retrieve the JobExecutionResult from the JobManager.", throwable);
	}
	finally {
		// failsafe shutdown of the client actor
		jobClientActor.tell(PoisonPill.getInstance(), ActorRef.noSender());
	}

	// second block handles the actual response
	if (answer instanceof JobManagerMessages.JobResultSuccess) {
		LOG.info("Job execution complete");

		SerializedJobExecutionResult result = ((JobManagerMessages.JobResultSuccess) answer).result();
		if (result != null) {
			try {
				return result.toJobExecutionResult(classLoader);
			} catch (Throwable t) {
				// keep the deserialization failure as the cause instead of dropping it
				throw new JobExecutionException(jobID,
					"Job was successfully executed but JobExecutionResult could not be deserialized.",
					t);
			}
		} else {
			throw new JobExecutionException(jobID,
				"Job was successfully executed but result contained a null JobExecutionResult.");
		}
	}
	else if (answer instanceof JobManagerMessages.JobResultFailure) {
		LOG.info("Job execution failed");

		SerializedThrowable serThrowable = ((JobManagerMessages.JobResultFailure) answer).cause();
		if (serThrowable != null) {
			Throwable cause = serThrowable.deserializeError(classLoader);
			if (cause instanceof JobExecutionException) {
				// the JobManager already reported a typed execution failure; rethrow as-is
				throw (JobExecutionException) cause;
			} else {
				throw new JobExecutionException(jobID, "Job execution failed", cause);
			}
		} else {
			throw new JobExecutionException(jobID,
				"Job execution failed with null as failure cause.");
		}
	}
	else if (answer instanceof JobManagerMessages.JobNotFound) {
		throw new JobRetrievalException(
			((JobManagerMessages.JobNotFound) answer).jobID(),
			"Couldn't retrieve Job " + jobID + " because it was not running.");
	}
	else {
		throw new JobExecutionException(jobID,
			"Unknown answer from JobManager after submitting the job: " + answer);
	}
}
/**
 * Sends a [[JobGraph]] to the JobClient actor specified by jobClient which submits it then to
 * the JobManager. The method blocks until the job has finished or the JobManager is no longer
 * alive. In the former case, the [[SerializedJobExecutionResult]] is returned and in the latter
 * case a [[JobExecutionException]] is thrown.
 *
 * @param actorSystem The actor system that performs the communication.
 * @param config The cluster wide configuration.
 * @param highAvailabilityServices Service factory for high availability services
 * @param jobGraph JobGraph describing the Flink job
 * @param timeout Timeout for futures
 * @param sysoutLogUpdates prints log updates to system out if true
 * @param classLoader The class loader for deserializing the results
 * @return The job execution result
 * @throws JobExecutionException Thrown if the job execution fails.
 */
public static JobExecutionResult submitJobAndWait(
		ActorSystem actorSystem,
		Configuration config,
		HighAvailabilityServices highAvailabilityServices,
		JobGraph jobGraph,
		FiniteDuration timeout,
		boolean sysoutLogUpdates,
		ClassLoader classLoader) throws JobExecutionException {

	// submit asynchronously, then block on the returned context until the job finishes
	final JobListeningContext listeningContext = submitJob(
		actorSystem,
		config,
		highAvailabilityServices,
		jobGraph,
		timeout,
		sysoutLogUpdates,
		classLoader);

	return awaitJobResult(listeningContext);
}
/**
 * Submits a job in detached mode. The method sends the JobGraph to the
 * JobManager and waits for the answer whether the job could be started or not.
 *
 * @param jobManagerGateway Gateway to the JobManager which will execute the jobs
 * @param config The cluster wide configuration.
 * @param jobGraph The job
 * @param timeout Timeout in which the JobManager must have responded.
 * @param classLoader Class loader used to deserialize a reported failure cause
 * @throws JobExecutionException if uploading the jars, sending the job, or the
 *         submission itself fails
 */
public static void submitJobDetached(
		ActorGateway jobManagerGateway,
		Configuration config,
		JobGraph jobGraph,
		FiniteDuration timeout,
		ClassLoader classLoader) throws JobExecutionException {

	checkNotNull(jobManagerGateway, "The jobManagerGateway must not be null.");
	checkNotNull(jobGraph, "The jobGraph must not be null.");
	checkNotNull(timeout, "The timeout must not be null.");

	LOG.info("Checking and uploading JAR files");
	try {
		jobGraph.uploadUserJars(jobManagerGateway, timeout, config);
	}
	catch (IOException e) {
		throw new JobSubmissionException(jobGraph.getJobID(),
			"Could not upload the program's JAR files to the JobManager.", e);
	}

	Object result;
	try {
		Future<Object> future = jobManagerGateway.ask(
			new JobManagerMessages.SubmitJob(
				jobGraph,
				ListeningBehaviour.DETACHED // only receive the Acknowledge for the job submission message
			),
			timeout);
		result = Await.result(future, timeout);
	}
	catch (TimeoutException e) {
		throw new JobTimeoutException(jobGraph.getJobID(),
			"JobManager did not respond within " + timeout.toString(), e);
	}
	catch (Throwable t) {
		// pass t itself as the cause; t.getCause() would drop the immediate failure
		// (and is null for exceptions without a cause)
		throw new JobSubmissionException(jobGraph.getJobID(),
			"Failed to send job to JobManager: " + t.getMessage(), t);
	}

	if (result instanceof JobManagerMessages.JobSubmitSuccess) {
		JobID respondedID = ((JobManagerMessages.JobSubmitSuccess) result).jobId();

		// validate response: the acknowledged ID must match the job we actually sent
		if (!respondedID.equals(jobGraph.getJobID())) {
			throw new JobExecutionException(jobGraph.getJobID(),
				"JobManager responded for wrong Job. This Job: " +
				jobGraph.getJobID() + ", response: " + respondedID);
		}
	}
	else if (result instanceof JobManagerMessages.JobResultFailure) {
		try {
			SerializedThrowable t = ((JobManagerMessages.JobResultFailure) result).cause();
			throw t.deserializeError(classLoader);
		}
		catch (JobExecutionException e) {
			// the deserialized error is already the right type; rethrow unchanged
			throw e;
		}
		catch (Throwable t) {
			throw new JobExecutionException(jobGraph.getJobID(),
				"JobSubmission failed: " + t.getMessage(), t);
		}
	}
	else {
		throw new JobExecutionException(jobGraph.getJobID(), "Unexpected response from JobManager: " + result);
	}
}
}