/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.addthis.hydra.task.pipeline;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;

import java.io.IOException;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.CancellationException;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.CompletionException;

import java.nio.file.Path;

import com.addthis.hydra.task.map.StreamMapper;
import com.addthis.hydra.task.run.TaskRunnable;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.Sets;

import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * <p>This is a <span class="hydra-summary">pipeline of one or more hydra jobs</span>.
 * It is specified with {@code type : "pipeline"}.</p>
 * <p>A pipeline job will run the first job phase to completion, then the second
 * phase, then the third phase, etc. When a pipeline job stops and is restarted
 * it begins processing from the first phase. It is recommended that phases use
 * the traditional mechanisms for processing data at most once, ie. the marks
 * directories.</p>
 * <p>Use the notation {@code ${hydra.task.jobid}} to retrieve the identifier
 * of the currently running job. This should be used by downstream phases to read
 * the files produced by an upstream phase.</p>
 * <p>By default error checking is enabled to verify that each job phase
 * does not write to an output directory of another job phase. If you want
 * to disable this error checking then set {@code validateDirs} to false.</p>
 * <p>All logging information is printed out using human (counting from 1)
 * numbering of the phases.</p>
 * <p>Example:</p>
 * <pre>
 * {pipeline.phases:[]}
 * </pre>
 *
 * @user-reference
 * @hydra-name pipeline
 */
public class PipelineTask implements TaskRunnable {

    private static final Logger log = LoggerFactory.getLogger(PipelineTask.class);

    /** The job phases, in execution order. */
    @Nonnull private final StreamMapper[] phases;

    /**
     * Optional per-phase skip flags. When non-null it has the same length as
     * {@link #phases}; a {@code true} entry means the phase at that index is skipped.
     */
    @Nullable private final boolean[] disable;

    /**
     * If true then ensure that writable directories are all unique.
     */
    private final boolean validateDirs;

    /**
     * Completion futures of every phase except the last one;
     * element {@code i} belongs to {@code phases[i]}.
     */
    private final ImmutableList<CompletableFuture<Void>> phaseComplete;

    /**
     * Dependent futures that start the following phase; element {@code i} runs
     * {@code beginPhase(i + 1)} once {@code phaseComplete.get(i)} completes.
     */
    private final ImmutableList<CompletableFuture<Void>> phaseNext;

    /**
     * The phase that has been started most recently, or null while no phase is
     * running or a phase is still being initialized. Written by
     * {@link #beginPhase(int)} (which is invoked from {@link #start()} and from
     * completion-future callbacks) and read by {@link #close()}.
     */
    private volatile StreamMapper currentPhase = null;

    /**
     * Creates the pipeline and chains each phase's completion future to the start
     * of the following phase. Output directories are validated before returning.
     *
     * @param phases       the job phases in execution order
     * @param disable      optional skip flags, one per phase; may be null
     * @param validateDirs whether to verify that no two enabled phases share an
     *                     output directory
     * @throws IllegalStateException    if {@code disable} is non-null and its length
     *                                  differs from the length of {@code phases}
     * @throws IllegalArgumentException if validation is enabled and two enabled
     *                                  phases have overlapping output directories
     */
    @JsonCreator
    public PipelineTask(@JsonProperty("phases") @Nonnull StreamMapper[] phases,
                        @JsonProperty("disable") boolean[] disable,
                        @JsonProperty("validateDirs") boolean validateDirs) {
        this.phases = phases;
        this.validateDirs = validateDirs;
        this.disable = disable;
        if ((disable != null) && (disable.length != phases.length)) {
            throw new IllegalStateException("disable array is not of equal length as phases array");
        }
        // The last phase has no successor, so it needs no chaining future.
        int futures = Math.max(phases.length - 1, 0);
        ImmutableList.Builder<CompletableFuture<Void>> complete = new ImmutableList.Builder<>();
        ImmutableList.Builder<CompletableFuture<Void>> next = new ImmutableList.Builder<>();
        for (int i = 0; i < futures; i++) {
            final int current = i;
            CompletableFuture<Void> phaseCompleteFuture = phases[i].getCompletionFuture();
            CompletableFuture<Void> phaseNextFuture =
                    phaseCompleteFuture.thenRun(() -> beginPhase(current + 1));
            complete.add(phaseCompleteFuture);
            next.add(phaseNextFuture);
        }
        this.phaseComplete = complete.build();
        this.phaseNext = next.build();
        validateWritableRootPaths();
    }

    /** Starts the pipeline at the first phase. */
    @Override
    public void start() {
        beginPhase(0);
    }

    /**
     * Shuts the pipeline down. Every phase completion future is cancelled, from
     * the last to the first. For each future that could not be cancelled the
     * matching "start next phase" future is joined, and finally the current
     * phase (if any) is closed.
     *
     * @throws IOException if a "start next phase" future completed exceptionally
     *                     or was cancelled
     * @throws Exception   if closing the current phase fails
     */
    @Override
    public void close() throws Exception {
        log.info("Pipeline task is starting shutdown");
        int size = phaseComplete.size();
        boolean[] cancelled = new boolean[size];
        for (int i = (size - 1); i >= 0; i--) {
            cancelled[i] = phaseComplete.get(i).cancel(false);
        }
        for (int i = (size - 1); i >= 0; i--) {
            try {
                // cancel() returned false, so the completion future had already
                // finished; wait for its dependent phase start to finish too.
                if (!cancelled[i]) {
                    phaseNext.get(i).join();
                }
            } catch (CompletionException ex) {
                String msg = "Phase " + (i + 1) + " phaseNext future encountered an "
                             + "exception while starting phase " + (i + 2);
                throw new IOException(msg, ex);
            } catch (CancellationException ex) {
                String msg = "Race condition: Phase " + (i + 1) + " phaseNext "
                             + "future was cancelled by another thread";
                throw new IOException(msg, ex);
            }
        }
        /*
         * At this point all phaseNext futures have either
         * completed or have been cancelled. It is now
         * safe to close the current phase.
         */
        if (currentPhase != null) {
            currentPhase.close();
        }
    }

    /**
     * Begin a phase. Clears the {@code currentPhase} before initializing
     * the new phase and assigns {@code currentPhase} on successful
     * initialization. Disabled phases are skipped by moving on to the
     * following phase; a position past the last phase is a no-op.
     *
     * @param pos zero-based index of the phase to begin
     */
    private void beginPhase(int pos) {
        if (pos >= phases.length) {
            return;
        }
        if ((disable != null) && (disable[pos])) {
            log.info("Skipping phase {} because it is disabled.", pos + 1);
            beginPhase(pos + 1);
        } else {
            log.info("Initializing phase {} for execution.", pos + 1);
            currentPhase = null;
            phases[pos].start();
            currentPhase = phases[pos];
        }
    }

    /**
     * Collects the writable root paths of every enabled phase, in phase order.
     *
     * @return the concatenated writable root paths of all phases that are not disabled
     */
    @Nonnull @Override
    public ImmutableList<Path> writableRootPaths() {
        ImmutableList.Builder<Path> builder = new ImmutableList.Builder<>();
        for (int i = 0; i < phases.length; i++) {
            if ((disable != null) && disable[i]) {
                continue;
            }
            builder.addAll(phases[i].writableRootPaths());
        }
        return builder.build();
    }

    /**
     * Verifies that no two enabled phases write to the same output directory.
     * Does nothing when {@code validateDirs} is false. Each phase is first asked
     * to validate its own paths; then the writable root paths of every pair of
     * enabled phases are compared. Disabled phases take no part in the pairwise
     * comparison.
     *
     * @throws IllegalArgumentException if at least one pair of enabled phases has
     *                                  overlapping output directories; the message
     *                                  lists every offending pair
     */
    public void validateWritableRootPaths() {
        if (!validateDirs) {
            return;
        }
        for (StreamMapper phase : phases) {
            phase.validateWritableRootPaths();
        }
        // Entries stay null for disabled phases.
        @SuppressWarnings("unchecked")
        Set<Path>[] outputDirs = new Set[phases.length];
        StringBuilder builder = new StringBuilder();
        for (int i = 0; i < phases.length; i++) {
            if ((disable != null) && disable[i]) {
                continue;
            }
            outputDirs[i] = new HashSet<>(phases[i].writableRootPaths());
            for (int j = 0; j < i; j++) {
                // A disabled earlier phase has no recorded directories;
                // Sets.intersection rejects null arguments, so skip it.
                if (outputDirs[j] == null) {
                    continue;
                }
                Sets.SetView<Path> intersect = Sets.intersection(outputDirs[i], outputDirs[j]);
                if (!intersect.isEmpty()) {
                    String message = String.format("Phases %d and %d have overlapping output directories: \"%s\"\n",
                                                   (j + 1), (i + 1), intersect.toString());
                    builder.append(message);
                }
            }
        }
        if (builder.length() > 0) {
            throw new IllegalArgumentException(builder.toString());
        }
    }
}