/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.runners.dataflow;

import static com.google.common.base.MoreObjects.firstNonNull;
import static org.apache.beam.runners.dataflow.util.TimeUtil.fromCloudTime;

import com.google.api.client.googleapis.json.GoogleJsonResponseException;
import com.google.api.client.util.BackOff;
import com.google.api.client.util.BackOffUtils;
import com.google.api.client.util.NanoClock;
import com.google.api.client.util.Sleeper;
import com.google.api.services.dataflow.model.Job;
import com.google.api.services.dataflow.model.JobMessage;
import com.google.api.services.dataflow.model.MetricUpdate;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.BiMap;
import com.google.common.collect.HashBiMap;
import com.google.common.collect.ImmutableMap;
import java.io.IOException;
import java.net.SocketTimeoutException;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.FutureTask;
import java.util.concurrent.atomic.AtomicReference;
import javax.annotation.Nullable;
import org.apache.beam.runners.dataflow.options.DataflowPipelineOptions;
import org.apache.beam.runners.dataflow.util.MonitoringUtil;
import org.apache.beam.sdk.PipelineResult;
import org.apache.beam.sdk.metrics.MetricResults;
import org.apache.beam.sdk.runners.AppliedPTransform;
import org.apache.beam.sdk.util.BackOffAdapter;
import org.apache.beam.sdk.util.FluentBackoff;
import org.joda.time.Duration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A DataflowPipelineJob represents a job submitted to Dataflow using
 * {@link DataflowRunner}.
 */
public class DataflowPipelineJob implements PipelineResult {

  private static final Logger LOG = LoggerFactory.getLogger(DataflowPipelineJob.class);

  /**
   * The id for the job.
   */
  protected String jobId;

  /**
   * The {@link DataflowPipelineOptions} for the job.
   */
  private final DataflowPipelineOptions dataflowOptions;

  /**
   * Client for the Dataflow service. This can be used to query the service
   * for information about the job.
   */
  private final DataflowClient dataflowClient;

  /**
   * MetricResults object for Dataflow Runner. It allows for querying of metrics from the Dataflow
   * service.
   */
  private final DataflowMetrics dataflowMetrics;

  /**
   * The state the job terminated in or {@code null} if the job has not terminated.
   */
  @Nullable
  private State terminalState = null;

  /**
   * The job that replaced this one or {@code null} if the job has not been replaced.
   */
  @Nullable
  private DataflowPipelineJob replacedByJob = null;

  protected BiMap<AppliedPTransform<?, ?, ?>, String> transformStepNames;

  /**
   * The Metric Updates retrieved after the job was in a terminal state.
   */
  private List<MetricUpdate> terminalMetricUpdates;

  /**
   * The latest timestamp up to which job messages have been retrieved.
   */
  private long lastTimestamp = Long.MIN_VALUE;

  /**
   * The polling interval for job status and messages information.
   */
  static final Duration MESSAGES_POLLING_INTERVAL = Duration.standardSeconds(2);
  static final Duration STATUS_POLLING_INTERVAL = Duration.standardSeconds(2);

  static final double DEFAULT_BACKOFF_EXPONENT = 1.5;

  /**
   * The amount of polling retries for job status and messages information.
   */
  static final int MESSAGES_POLLING_RETRIES = 11;
  static final int STATUS_POLLING_RETRIES = 4;

  private static final FluentBackoff MESSAGES_BACKOFF_FACTORY =
      FluentBackoff.DEFAULT
          .withInitialBackoff(MESSAGES_POLLING_INTERVAL)
          .withMaxRetries(MESSAGES_POLLING_RETRIES)
          .withExponent(DEFAULT_BACKOFF_EXPONENT);
  protected static final FluentBackoff STATUS_BACKOFF_FACTORY =
      FluentBackoff.DEFAULT
          .withInitialBackoff(STATUS_POLLING_INTERVAL)
          .withMaxRetries(STATUS_POLLING_RETRIES)
          .withExponent(DEFAULT_BACKOFF_EXPONENT);

  /**
   * Constructs the job.
   *
   * @param jobId the job id
   * @param dataflowOptions used to configure the client for the Dataflow Service
   * @param transformStepNames a mapping from AppliedPTransforms to Step Names
   */
  public DataflowPipelineJob(
      DataflowClient dataflowClient,
      String jobId,
      DataflowPipelineOptions dataflowOptions,
      Map<AppliedPTransform<?, ?, ?>, String> transformStepNames) {
    this.dataflowClient = dataflowClient;
    this.jobId = jobId;
    this.dataflowOptions = dataflowOptions;
    // Tolerate a null mapping by substituting an empty one; HashBiMap gives us
    // the reverse (step name -> transform) lookup as well.
    this.transformStepNames = HashBiMap.create(
        firstNonNull(transformStepNames, ImmutableMap.<AppliedPTransform<?, ?, ?>, String>of()));
    this.dataflowMetrics = new DataflowMetrics(this, this.dataflowClient);
  }

  /**
   * Get the id of this job.
   */
  public String getJobId() {
    return jobId;
  }

  /**
   * Get the project this job exists in.
   */
  public String getProjectId() {
    return dataflowOptions.getProject();
  }

  /**
   * Returns a new {@link DataflowPipelineJob} for the job that replaced this one, if applicable.
   *
   * @throws IllegalStateException if called before the job has terminated or if the job terminated
   * but was not updated
   */
  public DataflowPipelineJob getReplacedByJob() {
    if (terminalState == null) {
      throw new IllegalStateException("getReplacedByJob() called before job terminated");
    }
    if (replacedByJob == null) {
      throw new IllegalStateException("getReplacedByJob() called for job that was not replaced");
    }
    return replacedByJob;
  }

  @Override
  @Nullable
  public State waitUntilFinish() {
    // Negative duration means "wait forever".
    return waitUntilFinish(Duration.millis(-1));
  }

  @Override
  @Nullable
  public State waitUntilFinish(Duration duration) {
    try {
      return waitUntilFinish(duration, new MonitoringUtil.LoggingHandler());
    } catch (Exception e) {
      // Restore the interrupt flag before translating to an unchecked exception,
      // so callers can still observe the interruption.
      if (e instanceof InterruptedException) {
        Thread.currentThread().interrupt();
      }
      if (e instanceof RuntimeException) {
        throw (RuntimeException) e;
      }
      throw new RuntimeException(e);
    }
  }

  /**
   * Waits until the pipeline finishes and returns the final status.
   *
   * @param duration The time to wait for the job to finish.
   *     Provide a value less than 1 ms for an infinite wait.
   *
   * @param messageHandler If non null this handler will be invoked for each
   *     batch of messages received.
   * @return The final state of the job or null on timeout or if the
   *     thread is interrupted.
   * @throws IOException If there is a persistent problem getting job
   *     information.
   */
  @Nullable
  @VisibleForTesting
  public State waitUntilFinish(
      Duration duration,
      MonitoringUtil.JobMessagesHandler messageHandler)
          throws IOException, InterruptedException {
    // We ignore the potential race condition here (Ctrl-C after job submission but before the
    // shutdown hook is registered). Even if we tried to do something smarter (eg., SettableFuture)
    // the run method (which produces the job) could fail or be Ctrl-C'd before it had returned a
    // job. The display of the command to cancel the job is best-effort anyways -- RPC's could fail,
    // etc. If the user wants to verify the job was cancelled they should look at the job status.
Thread shutdownHook = new Thread() { @Override public void run() { LOG.warn("Job is already running in Google Cloud Platform, Ctrl-C will not cancel it.\n" + "To cancel the job in the cloud, run:\n> {}", MonitoringUtil.getGcloudCancelCommand(dataflowOptions, getJobId())); } }; try { Runtime.getRuntime().addShutdownHook(shutdownHook); return waitUntilFinish( duration, messageHandler, Sleeper.DEFAULT, NanoClock.SYSTEM, new MonitoringUtil(dataflowClient)); } finally { Runtime.getRuntime().removeShutdownHook(shutdownHook); } } @Nullable @VisibleForTesting State waitUntilFinish( Duration duration, @Nullable MonitoringUtil.JobMessagesHandler messageHandler, Sleeper sleeper, NanoClock nanoClock) throws IOException, InterruptedException { return waitUntilFinish( duration, messageHandler, sleeper, nanoClock, new MonitoringUtil(dataflowClient)); } /** * Waits until the pipeline finishes and returns the final status. * * @param duration The time to wait for the job to finish. * Provide a value less than 1 ms for an infinite wait. * * @param messageHandler If non null this handler will be invoked for each * batch of messages received. * @param sleeper A sleeper to use to sleep between attempts. * @param nanoClock A nanoClock used to time the total time taken. * @return The final state of the job or null on timeout. * @throws IOException If there is a persistent problem getting job * information. * @throws InterruptedException if the thread is interrupted. 
*/ @Nullable @VisibleForTesting State waitUntilFinish( Duration duration, @Nullable MonitoringUtil.JobMessagesHandler messageHandler, Sleeper sleeper, NanoClock nanoClock, MonitoringUtil monitor) throws IOException, InterruptedException { BackOff backoff; if (!duration.isLongerThan(Duration.ZERO)) { backoff = BackOffAdapter.toGcpBackOff(MESSAGES_BACKOFF_FACTORY.backoff()); } else { backoff = BackOffAdapter.toGcpBackOff( MESSAGES_BACKOFF_FACTORY.withMaxCumulativeBackoff(duration).backoff()); } // This function tracks the cumulative time from the *first request* to enforce the wall-clock // limit. Any backoff instance could, at best, track the the time since the first attempt at a // given request. Thus, we need to track the cumulative time ourselves. long startNanos = nanoClock.nanoTime(); State state; do { // Get the state of the job before listing messages. This ensures we always fetch job // messages after the job finishes to ensure we have all them. state = getStateWithRetries( BackOffAdapter.toGcpBackOff( STATUS_BACKOFF_FACTORY.withMaxRetries(0).backoff()), sleeper); boolean hasError = state == State.UNKNOWN; if (messageHandler != null && !hasError) { // Process all the job messages that have accumulated so far. try { List<JobMessage> allMessages = monitor.getJobMessages( jobId, lastTimestamp); if (!allMessages.isEmpty()) { lastTimestamp = fromCloudTime(allMessages.get(allMessages.size() - 1).getTime()).getMillis(); messageHandler.process(allMessages); } } catch (GoogleJsonResponseException | SocketTimeoutException e) { hasError = true; LOG.warn("There were problems getting current job messages: {}.", e.getMessage()); LOG.debug("Exception information:", e); } } if (!hasError) { // We can stop if the job is done. if (state.isTerminal()) { switch (state) { case DONE: case CANCELLED: LOG.info("Job {} finished with status {}.", getJobId(), state); break; case UPDATED: LOG.info("Job {} has been updated and is running as the new job with id {}. 
" + "To access the updated job on the Dataflow monitoring console, " + "please navigate to {}", getJobId(), getReplacedByJob().getJobId(), MonitoringUtil.getJobMonitoringPageURL( getReplacedByJob().getProjectId(), getReplacedByJob().getJobId())); break; default: LOG.info("Job {} failed with status {}.", getJobId(), state); } return state; } // The job is not done, so we must keep polling. backoff.reset(); // If a total duration for all backoff has been set, update the new cumulative sleep time to // be the remaining total backoff duration, stopping if we have already exceeded the // allotted time. if (duration.isLongerThan(Duration.ZERO)) { long nanosConsumed = nanoClock.nanoTime() - startNanos; Duration consumed = Duration.millis((nanosConsumed + 999999) / 1000000); Duration remaining = duration.minus(consumed); if (remaining.isLongerThan(Duration.ZERO)) { backoff = BackOffAdapter.toGcpBackOff( MESSAGES_BACKOFF_FACTORY.withMaxCumulativeBackoff(remaining).backoff()); } else { // If there is no time remaining, don't bother backing off. backoff = BackOff.STOP_BACKOFF; } } } } while(BackOffUtils.next(sleeper, backoff)); LOG.warn("No terminal state was returned. State value {}", state); return null; // Timed out. } private AtomicReference<FutureTask<State>> cancelState = new AtomicReference<>(); @Override public State cancel() throws IOException { // Enforce that a cancel() call on the job is done at most once - as // a workaround for Dataflow service's current bugs with multiple // cancellation, where it may sometimes return an error when cancelling // a job that was already cancelled, but still report the job state as // RUNNING. // To partially work around these issues, we absorb duplicate cancel() // calls. This, of course, doesn't address the case when the job terminates // externally almost concurrently to calling cancel(), but at least it // makes it possible to safely call cancel() multiple times and from // multiple threads in one program. 
FutureTask<State> tentativeCancelTask = new FutureTask<>(new Callable<State>() { @Override public State call() throws Exception { Job content = new Job(); content.setProjectId(getProjectId()); content.setId(jobId); content.setRequestedState("JOB_STATE_CANCELLED"); try { Job job = dataflowClient.updateJob(jobId, content); return MonitoringUtil.toState(job.getCurrentState()); } catch (IOException e) { State state = getState(); if (state.isTerminal()) { LOG.warn("Cancel failed because job is already terminated. State is {}", state); return state; } else if (e.getMessage().contains("has terminated")) { // This handles the case where the getState() call above returns RUNNING but the cancel // was rejected because the job is in fact done. Hopefully, someday we can delete this // code if there is better consistency between the State and whether Cancel succeeds. // // Example message: // Workflow modification failed. Causes: (7603adc9e9bff51e): Cannot perform // operation 'cancel' on Job: 2017-04-01_22_50_59-9269855660514862348. Job has // terminated in state SUCCESS: Workflow job: 2017-04-01_22_50_59-9269855660514862348 // succeeded. LOG.warn("Cancel failed because job is already terminated.", e); return state; } else { String errorMsg = String.format( "Failed to cancel job in state %s, " + "please go to the Developers Console to cancel it manually: %s", state, MonitoringUtil.getJobMonitoringPageURL(getProjectId(), getJobId())); LOG.warn(errorMsg); throw new IOException(errorMsg, e); } } } }); if (cancelState.compareAndSet(null, tentativeCancelTask)) { // This thread should perform cancellation, while others will // only wait for the result. 
cancelState.get().run(); } try { return cancelState.get().get(); } catch (InterruptedException | ExecutionException e) { throw new IOException(e); } } @Override public State getState() { if (terminalState != null) { return terminalState; } return getStateWithRetries( BackOffAdapter.toGcpBackOff(STATUS_BACKOFF_FACTORY.backoff()), Sleeper.DEFAULT); } /** * Attempts to get the state. Uses exponential backoff on failure up to the maximum number * of passed in attempts. * * @param attempts The amount of attempts to make. * @param sleeper Object used to do the sleeps between attempts. * @return The state of the job or State.UNKNOWN in case of failure. */ @VisibleForTesting State getStateWithRetries(BackOff attempts, Sleeper sleeper) { if (terminalState != null) { return terminalState; } try { Job job = getJobWithRetries(attempts, sleeper); return MonitoringUtil.toState(job.getCurrentState()); } catch (IOException exn) { // The only IOException that getJobWithRetries is permitted to throw is the final IOException // that caused the failure of retry. Other exceptions are wrapped in an unchecked exceptions // and will propagate. return State.UNKNOWN; } } /** * Attempts to get the underlying {@link Job}. Uses exponential backoff on failure up to the * maximum number of passed in attempts. * * @param backoff the {@link BackOff} used to control retries. * @param sleeper Object used to do the sleeps between attempts. * @return The underlying {@link Job} object. * @throws IOException When the maximum number of retries is exhausted, the last exception is * thrown. 
*/ private Job getJobWithRetries(BackOff backoff, Sleeper sleeper) throws IOException { // Retry loop ends in return or throw while (true) { try { Job job = dataflowClient.getJob(jobId); State currentState = MonitoringUtil.toState(job.getCurrentState()); if (currentState.isTerminal()) { terminalState = currentState; replacedByJob = new DataflowPipelineJob( dataflowClient, job.getReplacedByJobId(), dataflowOptions, transformStepNames); } return job; } catch (IOException exn) { LOG.warn("There were problems getting current job status: {}.", exn.getMessage()); LOG.debug("Exception information:", exn); if (!nextBackOff(sleeper, backoff)) { throw exn; } } } } /** * Identical to {@link BackOffUtils#next} but without checked exceptions. */ private boolean nextBackOff(Sleeper sleeper, BackOff backoff) { try { return BackOffUtils.next(sleeper, backoff); } catch (InterruptedException | IOException e) { if (e instanceof InterruptedException) { Thread.currentThread().interrupt(); } throw new RuntimeException(e); } } @Override public MetricResults metrics() { return dataflowMetrics; } }