/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package gobblin.util;

import java.io.IOException;
import java.net.URI;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;

import lombok.extern.slf4j.Slf4j;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;

import com.google.common.base.Preconditions;
import com.google.common.base.Strings;
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
import com.google.common.collect.Lists;
import com.google.common.io.Closer;

import gobblin.configuration.ConfigurationKeys;
import gobblin.configuration.State;
import gobblin.source.workunit.MultiWorkUnit;
import gobblin.source.workunit.WorkUnit;


/**
 * Utility class for the job scheduler and job launchers.
 *
 * @author Yinan Li
 */
@Slf4j
public class JobLauncherUtils {

  // A cache of proxied FileSystems keyed by owner names
  private static Cache<String, FileSystem> fileSystemCacheByOwners = CacheBuilder.newBuilder().build();

  /**
   * Create a new job ID.
   *
   * @param jobName job name
   * @return new job ID
   */
  public static String newJobId(String jobName) {
    return Id.Job.create(jobName, System.currentTimeMillis()).toString();
  }

  /**
   * Create a new task ID for the job with the given job ID.
   *
   * @param jobId job ID
   * @param sequence task sequence number
   * @return new task ID
   */
  public static String newTaskId(String jobId, int sequence) {
    return Id.Task.create(Id.parse(jobId).get(Id.Parts.INSTANCE_NAME), sequence).toString();
  }

  /**
   * Create an ID for a new multi-task (corresponding to a {@link gobblin.source.workunit.MultiWorkUnit})
   * for the job with the given job ID.
   *
   * @param jobId job ID
   * @param sequence multi-task sequence number
   * @return new multi-task ID
   */
  public static String newMultiTaskId(String jobId, int sequence) {
    return Id.MultiTask.create(Id.parse(jobId).get(Id.Parts.INSTANCE_NAME), sequence).toString();
  }

  /**
   * Flatten a {@link Collection} of {@link WorkUnit}s. Any element that is a {@link MultiWorkUnit}
   * is recursively expanded by calling this method on the {@link WorkUnit}s returned by
   * {@link MultiWorkUnit#getWorkUnits()}; any other {@link WorkUnit} is added to the flattened
   * list directly.
   *
   * @param workUnits a {@link Collection} containing either {@link WorkUnit}s or {@link MultiWorkUnit}s
   * @return a {@link List} of flattened {@link WorkUnit}s
   */
  public static List<WorkUnit> flattenWorkUnits(Collection<WorkUnit> workUnits) {
    List<WorkUnit> flattenedWorkUnits = Lists.newArrayList();
    for (WorkUnit workUnit : workUnits) {
      if (workUnit instanceof MultiWorkUnit) {
        flattenedWorkUnits.addAll(flattenWorkUnits(((MultiWorkUnit) workUnit).getWorkUnits()));
      } else {
        flattenedWorkUnits.add(workUnit);
      }
    }
    return flattenedWorkUnits;
  }
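
  // A minimal usage sketch of flattenWorkUnits (illustrative only, not part of the
  // original source). It assumes the createEmpty() factory methods on WorkUnit and
  // MultiWorkUnit; if your version lacks them, construct the work units however your
  // Source normally does.
  //
  //   MultiWorkUnit multiWorkUnit = MultiWorkUnit.createEmpty();
  //   multiWorkUnit.addWorkUnits(Lists.newArrayList(WorkUnit.createEmpty(), WorkUnit.createEmpty()));
  //   List<WorkUnit> flattened =
  //       JobLauncherUtils.flattenWorkUnits(Lists.newArrayList(WorkUnit.createEmpty(), multiWorkUnit));
  //   // flattened now holds three plain WorkUnits; the nesting is gone.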

  /**
   * Cleanup the staging data for a list of Gobblin tasks. This method calls
   * {@link #cleanTaskStagingData(State, Logger)} once per {@link State}.
   *
   * @param states a {@link List} of {@link State}s that need their staging data cleaned
   * @param logger a {@link Logger} used for logging
   */
  public static void cleanStagingData(List<? extends State> states, Logger logger) throws IOException {
    for (State state : states) {
      JobLauncherUtils.cleanTaskStagingData(state, logger);
    }
  }

  /**
   * Cleanup staging data of all tasks of a job.
   *
   * @param state a {@link State} instance storing job configuration properties
   * @param logger a {@link Logger} used for logging
   */
  public static void cleanJobStagingData(State state, Logger logger) throws IOException {
    Preconditions.checkArgument(state.contains(ConfigurationKeys.WRITER_STAGING_DIR),
        "Missing required property " + ConfigurationKeys.WRITER_STAGING_DIR);
    Preconditions.checkArgument(state.contains(ConfigurationKeys.WRITER_OUTPUT_DIR),
        "Missing required property " + ConfigurationKeys.WRITER_OUTPUT_DIR);

    String writerFsUri = state.getProp(ConfigurationKeys.WRITER_FILE_SYSTEM_URI, ConfigurationKeys.LOCAL_FS_URI);
    FileSystem fs = getFsWithProxy(state, writerFsUri, WriterUtils.getFsConfiguration(state));

    Path jobStagingPath = new Path(state.getProp(ConfigurationKeys.WRITER_STAGING_DIR));
    logger.info("Cleaning up staging directory " + jobStagingPath);
    HadoopUtils.deletePath(fs, jobStagingPath, true);

    // Also remove the parent directory if deleting the staging directory left it empty
    if (fs.exists(jobStagingPath.getParent()) && fs.listStatus(jobStagingPath.getParent()).length == 0) {
      logger.info("Deleting directory " + jobStagingPath.getParent());
      HadoopUtils.deletePath(fs, jobStagingPath.getParent(), true);
    }

    Path jobOutputPath = new Path(state.getProp(ConfigurationKeys.WRITER_OUTPUT_DIR));
    logger.info("Cleaning up output directory " + jobOutputPath);
    HadoopUtils.deletePath(fs, jobOutputPath, true);

    // Also remove the parent directory if deleting the output directory left it empty
    if (fs.exists(jobOutputPath.getParent()) && fs.listStatus(jobOutputPath.getParent()).length == 0) {
      logger.info("Deleting directory " + jobOutputPath.getParent());
      HadoopUtils.deletePath(fs, jobOutputPath.getParent(), true);
    }

    if (state.contains(ConfigurationKeys.ROW_LEVEL_ERR_FILE)) {
      if (state.getPropAsBoolean(ConfigurationKeys.CLEAN_ERR_DIR, ConfigurationKeys.DEFAULT_CLEAN_ERR_DIR)) {
        // Resolve the err directory from the property value, not the property key itself
        Path jobErrPath = new Path(state.getProp(ConfigurationKeys.ROW_LEVEL_ERR_FILE));
        logger.info("Cleaning up err directory: " + jobErrPath);
        HadoopUtils.deleteIfExists(fs, jobErrPath, true);
      }
    }
  }
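
  // A hedged sketch of the job-level state cleanJobStagingData expects (illustrative
  // only; the directory values below are hypothetical):
  //
  //   State jobState = new State();
  //   jobState.setProp(ConfigurationKeys.WRITER_STAGING_DIR, "/data/staging/my-job");
  //   jobState.setProp(ConfigurationKeys.WRITER_OUTPUT_DIR, "/data/output/my-job");
  //   // WRITER_FILE_SYSTEM_URI is optional and defaults to the local file system
  //   JobLauncherUtils.cleanJobStagingData(jobState, log);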

  /**
   * Cleanup staging data of a Gobblin task.
   *
   * @param state a {@link State} instance storing task configuration properties
   * @param logger a {@link Logger} used for logging
   */
  public static void cleanTaskStagingData(State state, Logger logger) throws IOException {
    int numBranches = state.getPropAsInt(ConfigurationKeys.FORK_BRANCHES_KEY, 1);

    for (int branchId = 0; branchId < numBranches; branchId++) {
      String writerFsUri = state.getProp(
          ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_FILE_SYSTEM_URI, numBranches, branchId),
          ConfigurationKeys.LOCAL_FS_URI);
      FileSystem fs = getFsWithProxy(state, writerFsUri, WriterUtils.getFsConfiguration(state));

      Path stagingPath = WriterUtils.getWriterStagingDir(state, numBranches, branchId);
      if (fs.exists(stagingPath)) {
        logger.info("Cleaning up staging directory " + stagingPath.toUri().getPath());
        if (!fs.delete(stagingPath, true)) {
          throw new IOException("Clean up staging directory " + stagingPath.toUri().getPath() + " failed");
        }
      }

      Path outputPath = WriterUtils.getWriterOutputDir(state, numBranches, branchId);
      if (fs.exists(outputPath)) {
        logger.info("Cleaning up output directory " + outputPath.toUri().getPath());
        if (!fs.delete(outputPath, true)) {
          throw new IOException("Clean up output directory " + outputPath.toUri().getPath() + " failed");
        }
      }
    }
  }

  /**
   * Cleanup staging data of a Gobblin task using a {@link ParallelRunner}.
   *
   * @param state a {@link State} instance storing task configuration properties
   * @param logger a {@link Logger} used for logging
   * @param closer a {@link Closer} that registers the given map of {@link ParallelRunner}s. The caller is
   *               responsible for closing the closer after the cleaning is done.
   * @param parallelRunners a map from FileSystem URI to {@link ParallelRunner}
   * @throws IOException if it fails to cleanup the task staging data
   */
  public static void cleanTaskStagingData(State state, Logger logger, Closer closer,
      Map<String, ParallelRunner> parallelRunners) throws IOException {
    int numBranches = state.getPropAsInt(ConfigurationKeys.FORK_BRANCHES_KEY, 1);
    int parallelRunnerThreads =
        state.getPropAsInt(ParallelRunner.PARALLEL_RUNNER_THREADS_KEY, ParallelRunner.DEFAULT_PARALLEL_RUNNER_THREADS);

    for (int branchId = 0; branchId < numBranches; branchId++) {
      String writerFsUri = state.getProp(
          ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_FILE_SYSTEM_URI, numBranches, branchId),
          ConfigurationKeys.LOCAL_FS_URI);
      FileSystem fs = getFsWithProxy(state, writerFsUri, WriterUtils.getFsConfiguration(state));
      ParallelRunner parallelRunner = getParallelRunner(fs, closer, parallelRunnerThreads, parallelRunners);

      Path stagingPath = WriterUtils.getWriterStagingDir(state, numBranches, branchId);
      if (fs.exists(stagingPath)) {
        logger.info("Cleaning up staging directory " + stagingPath.toUri().getPath());
        parallelRunner.deletePath(stagingPath, true);
      }

      Path outputPath = WriterUtils.getWriterOutputDir(state, numBranches, branchId);
      if (fs.exists(outputPath)) {
        logger.info("Cleaning up output directory " + outputPath.toUri().getPath());
        parallelRunner.deletePath(outputPath, true);
      }
    }
  }
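
  // A hedged usage sketch of the ParallelRunner-based overload (illustrative only; the
  // taskStates list is hypothetical). The caller owns the Closer and the runner map;
  // closing the Closer waits for the asynchronous deletes and shuts the runners down.
  //
  //   Closer closer = Closer.create();
  //   Map<String, ParallelRunner> parallelRunners = Maps.newHashMap();
  //   try {
  //     for (State taskState : taskStates) {
  //       JobLauncherUtils.cleanTaskStagingData(taskState, log, closer, parallelRunners);
  //     }
  //   } finally {
  //     closer.close();
  //   }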

  /**
   * Get a {@link FileSystem} for the given file system URI, proxying as another user if
   * {@link ConfigurationKeys#SHOULD_FS_PROXY_AS_USER} is set. Proxied {@link FileSystem}s
   * are cached by the owner (proxy user) name.
   *
   * @param state a {@link State} instance storing the proxy configuration properties
   * @param fsUri the file system URI
   * @param conf the Hadoop {@link Configuration} used to instantiate the file system
   * @return a {@link FileSystem} instance, proxied if so configured
   * @throws IOException if instantiating the (possibly proxied) file system fails
   */
  private static FileSystem getFsWithProxy(final State state, final String fsUri, final Configuration conf)
      throws IOException {
    if (!state.getPropAsBoolean(ConfigurationKeys.SHOULD_FS_PROXY_AS_USER,
        ConfigurationKeys.DEFAULT_SHOULD_FS_PROXY_AS_USER)) {
      return FileSystem.get(URI.create(fsUri), conf);
    }

    Preconditions.checkArgument(!Strings.isNullOrEmpty(state.getProp(ConfigurationKeys.FS_PROXY_AS_USER_NAME)),
        "State does not contain a proper proxy user name");
    String owner = state.getProp(ConfigurationKeys.FS_PROXY_AS_USER_NAME);

    try {
      return fileSystemCacheByOwners.get(owner, new Callable<FileSystem>() {
        @Override
        public FileSystem call() throws Exception {
          return new ProxiedFileSystemWrapper().getProxiedFileSystem(state, ProxiedFileSystemWrapper.AuthType.KEYTAB,
              state.getProp(ConfigurationKeys.SUPER_USER_KEY_TAB_LOCATION), fsUri, conf);
        }
      });
    } catch (ExecutionException ee) {
      throw new IOException(ee.getCause());
    }
  }

  private static ParallelRunner getParallelRunner(FileSystem fs, Closer closer, int parallelRunnerThreads,
      Map<String, ParallelRunner> parallelRunners) {
    // Key runners by file system URI plus home directory so that different users of the
    // same file system get separate runners
    String uriAndHomeDir = new Path(new Path(fs.getUri()), fs.getHomeDirectory()).toString();
    if (!parallelRunners.containsKey(uriAndHomeDir)) {
      parallelRunners.put(uriAndHomeDir, closer.register(new ParallelRunner(parallelRunnerThreads, fs)));
    }
    return parallelRunners.get(uriAndHomeDir);
  }
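
  // A hedged sketch of how the proxy path is configured (illustrative only; the user
  // name and keytab location below are hypothetical). With proxying enabled, the first
  // call authenticates via the keytab, and later calls for the same owner reuse the
  // cached proxied FileSystem instead of re-authenticating.
  //
  //   State state = new State();
  //   state.setProp(ConfigurationKeys.SHOULD_FS_PROXY_AS_USER, Boolean.TRUE.toString());
  //   state.setProp(ConfigurationKeys.FS_PROXY_AS_USER_NAME, "data-owner");
  //   state.setProp(ConfigurationKeys.SUPER_USER_KEY_TAB_LOCATION, "/etc/security/keytabs/super.keytab");
  //   JobLauncherUtils.cleanTaskStagingData(state, log);  // proxies as "data-owner"
}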