/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package gobblin.util; import lombok.Data; import java.io.Closeable; import java.io.IOException; import java.util.Collection; import java.util.List; import java.util.concurrent.Callable; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; import java.util.concurrent.locks.Lock; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileAlreadyExistsException; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Writable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.common.base.Optional; import com.google.common.collect.Lists; import com.google.common.util.concurrent.Striped; import gobblin.configuration.State; /** * A class that is responsible for running certain methods in parallel. Methods in this class returns immediately and * are run in a fixed-size thread pool. * * <p> * This class is intended to be used in the following pattern. This example uses the serialize() method. * * <pre> {@code * Closer closer = Closer.create(); * try { * // Do stuff * ParallelRunner runner = closer.register(new ParallelRunner(threads, fs)); * runner.serialize(state1, outputFilePath1); * // Submit more serialization tasks * runner.serialize(stateN, outputFilePathN); * // Do stuff * } catch (Throwable e) { * throw closer.rethrow(e); * } finally { * closer.close(); * }} * </pre> * * Note that calling {@link #close()} will wait for all submitted tasks to complete and then stop the * {@link ParallelRunner} by shutting down the {@link ExecutorService}. * </p> * * @author Yinan Li */ public class ParallelRunner implements Closeable { private static final Logger LOGGER = LoggerFactory.getLogger(ParallelRunner.class); public static final String PARALLEL_RUNNER_THREADS_KEY = "parallel.runner.threads"; public static final int DEFAULT_PARALLEL_RUNNER_THREADS = 10; private final ExecutorService executor; private final FileSystem fs; private final List<NamedFuture> futures = Lists.newArrayList(); private final Striped<Lock> locks = Striped.lazyWeakLock(Integer.MAX_VALUE); private final FailPolicy failPolicy; public ParallelRunner(int threads, FileSystem fs) { this(threads, fs, FailPolicy.FAIL_ONE_FAIL_ALL); } public ParallelRunner(int threads, FileSystem fs, FailPolicy failPolicy) { this.executor = ExecutorsUtils.loggingDecorator(Executors.newFixedThreadPool(threads, ExecutorsUtils.newThreadFactory(Optional.of(LOGGER), Optional.of("ParallelRunner")))); this.fs = fs; this.failPolicy = failPolicy; } /** * Policies indicating how {@link ParallelRunner} should handle failure of tasks. */ public static enum FailPolicy { /** If a task fails, a warning will be logged, but the {@link ParallelRunner} will still succeed.*/ ISOLATE_FAILURES, /** If a task fails, all tasks will be tried, but {@link ParallelRunner#close} will throw the Exception.*/ FAIL_ONE_FAIL_ALL } /** * A future with a name / message for reporting. */ @Data public static class NamedFuture { private final Future<?> future; private final String name; } /** * Serialize a {@link State} object into a file. * * <p> * This method submits a task to serialize the {@link State} object and returns immediately * after the task is submitted. * </p> * * @param state the {@link State} object to be serialized * @param outputFilePath the file to write the serialized {@link State} object to * @param <T> the {@link State} object type */ public <T extends State> void serializeToFile(final T state, final Path outputFilePath) { // Use a Callable with a Void return type to allow exceptions to be thrown this.futures.add(new NamedFuture(this.executor.submit(new Callable<Void>() { @Override public Void call() throws Exception { SerializationUtils.serializeState(ParallelRunner.this.fs, outputFilePath, state); return null; } }), "Serialize state to " + outputFilePath)); } /** * Deserialize a {@link State} object from a file. * * <p> * This method submits a task to deserialize the {@link State} object and returns immediately * after the task is submitted. * </p> * * @param state an empty {@link State} object to which the deserialized content will be populated * @param inputFilePath the input file to read from * @param <T> the {@link State} object type */ public <T extends State> void deserializeFromFile(final T state, final Path inputFilePath) { this.futures.add(new NamedFuture(this.executor.submit(new Callable<Void>() { @Override public Void call() throws Exception { SerializationUtils.deserializeState(ParallelRunner.this.fs, inputFilePath, state); return null; } }), "Deserialize state from " + inputFilePath)); } /** * Deserialize a list of {@link State} objects from a Hadoop {@link SequenceFile}. * * <p> * This method submits a task to deserialize the {@link State} objects and returns immediately * after the task is submitted. * </p> * * @param stateClass the {@link Class} object of the {@link State} class * @param inputFilePath the input {@link SequenceFile} to read from * @param states a {@link Collection} object to store the deserialized {@link State} objects * @param deleteAfter a flag telling whether to delete the {@link SequenceFile} afterwards * @param <T> the {@link State} object type */ public <T extends State> void deserializeFromSequenceFile(final Class<? extends Writable> keyClass, final Class<T> stateClass, final Path inputFilePath, final Collection<T> states, final boolean deleteAfter) { this.futures.add(new NamedFuture(this.executor.submit(new Callable<Void>() { @Override public Void call() throws Exception { Configuration conf = new Configuration(ParallelRunner.this.fs.getConf()); WritableShimSerialization.addToHadoopConfiguration(conf); try (@SuppressWarnings("deprecation") SequenceFile.Reader reader = new SequenceFile.Reader( ParallelRunner.this.fs, inputFilePath, conf)) { Writable key = keyClass.newInstance(); T state = stateClass.newInstance(); while (reader.next(key)) { state = (T) reader.getCurrentValue(state); states.add(state); state = stateClass.newInstance(); } if (deleteAfter) { HadoopUtils.deletePath(ParallelRunner.this.fs, inputFilePath, false); } } return null; } }), "Deserialize state from file " + inputFilePath)); } /** * Delete a {@link Path}. * * <p> * This method submits a task to delete a {@link Path} and returns immediately * after the task is submitted. * </p> * * @param path path to be deleted. */ public void deletePath(final Path path, final boolean recursive) { this.futures.add(new NamedFuture(this.executor.submit(new Callable<Void>() { @Override public Void call() throws Exception { Lock lock = ParallelRunner.this.locks.get(path.toString()); lock.lock(); try { HadoopUtils.deletePath(ParallelRunner.this.fs, path, recursive); return null; } finally { lock.unlock(); } } }), "Delete path " + path)); } /** * Rename a {@link Path}. * * <p> * This method submits a task to rename a {@link Path} and returns immediately * after the task is submitted. * </p> * * @param src path to be renamed * @param dst new path after rename * @param group an optional group name for the destination path */ public void renamePath(final Path src, final Path dst, final Optional<String> group) { this.futures.add(new NamedFuture(this.executor.submit(new Callable<Void>() { @Override public Void call() throws Exception { Lock lock = ParallelRunner.this.locks.get(src.toString()); lock.lock(); try { if (ParallelRunner.this.fs.exists(src)) { HadoopUtils.renamePath(ParallelRunner.this.fs, src, dst); if (group.isPresent()) { HadoopUtils.setGroup(ParallelRunner.this.fs, dst, group.get()); } } return null; } catch (FileAlreadyExistsException e) { LOGGER.warn(String.format("Failed to rename %s to %s: dst already exists", src, dst), e); return null; } finally { lock.unlock(); } } }), "Rename " + src + " to " + dst)); } /** * Move a {@link Path}. * * <p> * This method submits a task to move a {@link Path} and returns immediately * after the task is submitted. * </p> * * @param src path to be moved * @param dstFs the destination {@link FileSystem} * @param dst the destination path * @param group an optional group name for the destination path */ public void movePath(final Path src, final FileSystem dstFs, final Path dst, final Optional<String> group) { movePath(src, dstFs, dst, false, group); } /** * Move a {@link Path}. * * <p> * This method submits a task to move a {@link Path} and returns immediately * after the task is submitted. * </p> * * @param src path to be moved * @param dstFs the destination {@link FileSystem} * @param dst the destination path * @param overwrite true to overwrite the destination * @param group an optional group name for the destination path */ public void movePath(final Path src, final FileSystem dstFs, final Path dst, final boolean overwrite, final Optional<String> group) { this.futures.add(new NamedFuture(this.executor.submit(new Callable<Void>() { @Override public Void call() throws Exception { Lock lock = ParallelRunner.this.locks.get(src.toString()); lock.lock(); try { if (ParallelRunner.this.fs.exists(src)) { HadoopUtils.movePath(ParallelRunner.this.fs, src, dstFs, dst, overwrite, dstFs.getConf()); if (group.isPresent()) { HadoopUtils.setGroup(dstFs, dst, group.get()); } } return null; } catch (FileAlreadyExistsException e) { LOGGER.warn(String.format("Failed to move %s to %s: dst already exists", src, dst), e); return null; } finally { lock.unlock(); } } }), "Move " + src + " to " + dst)); } /** * Submit a callable to the thread pool * * <p> * This method submits a task and returns immediately * </p> * * @param callable the callable to submit * @param name for the future */ public void submitCallable(Callable<Void> callable, String name) { this.futures.add(new NamedFuture(this.executor.submit(callable), name)); } @Override public void close() throws IOException { // Wait for all submitted tasks to complete try { boolean wasInterrupted = false; IOException exception = null; for (NamedFuture future : this.futures) { try { if (wasInterrupted) { future.getFuture().cancel(true); } else { future.getFuture().get(); } } catch (InterruptedException ie) { LOGGER.warn("Task was interrupted: " + future.getName()); wasInterrupted = true; if (exception == null) { exception = new IOException(ie); } } catch (ExecutionException ee) { LOGGER.warn("Task failed: " + future.getName(), ee.getCause()); if (exception == null) { exception = new IOException(ee.getCause()); } } } if (wasInterrupted) { Thread.currentThread().interrupt(); } if (exception != null && this.failPolicy == FailPolicy.FAIL_ONE_FAIL_ALL) { throw exception; } } finally { ExecutorsUtils.shutdownExecutorService(this.executor, Optional.of(LOGGER)); } } }