/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package gobblin.util; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.util.Collection; import java.util.List; import java.util.Map.Entry; import java.util.Properties; import java.util.Queue; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; import org.apache.commons.lang.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileAlreadyExistsException; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileUtil; import org.apache.hadoop.fs.LocalFileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.RawLocalFileSystem; import org.apache.hadoop.fs.permission.FsPermission; import org.apache.hadoop.io.IOUtils; import org.apache.hadoop.io.Writable; import org.apache.hadoop.util.ReflectionUtils; import com.google.common.base.Optional; import com.google.common.base.Preconditions; import com.google.common.collect.ImmutableSortedSet; import com.google.common.collect.Lists; import com.google.common.collect.Queues; import com.google.common.io.BaseEncoding; import com.typesafe.config.Config; import com.typesafe.config.ConfigFactory; import com.typesafe.config.ConfigValue; import lombok.AllArgsConstructor; import lombok.extern.slf4j.Slf4j; import gobblin.configuration.ConfigurationKeys; import gobblin.configuration.State; import gobblin.util.deprecation.DeprecationUtils; import gobblin.util.executors.ScalingThreadPoolExecutor; import gobblin.writer.DataWriter; /** * A utility class for working with Hadoop. */ @Slf4j public class HadoopUtils { public static final String HDFS_ILLEGAL_TOKEN_REGEX = "[\\s:\\\\]"; /** * A {@link Collection} of all known {@link FileSystem} schemes that do not support atomic renames or copies. * * <p> * The following important properties are useful to remember when writing code that is compatible with S3: * <ul> * <li>Renames are not atomic, and require copying the entire source file to the destination file</li> * <li>Writes to S3 using {@link FileSystem#create(Path)} will first go to the local filesystem, when the stream * is closed the local file will be uploaded to S3</li> * </ul> * </p> */ public static final Collection<String> FS_SCHEMES_NON_ATOMIC = ImmutableSortedSet.orderedBy(String.CASE_INSENSITIVE_ORDER).add("s3").add("s3a").add("s3n").build(); public static final String MAX_FILESYSTEM_QPS = "filesystem.throttling.max.filesystem.qps"; private static final List<String> DEPRECATED_KEYS = Lists.newArrayList("gobblin.copy.max.filesystem.qps"); private static final int MAX_RENAME_TRIES = 3; public static Configuration newConfiguration() { Configuration conf = new Configuration(); // Explicitly check for S3 environment variables, so that Hadoop can access s3 and s3n URLs. // h/t https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala String awsAccessKeyId = System.getenv("AWS_ACCESS_KEY_ID"); String awsSecretAccessKey = System.getenv("AWS_SECRET_ACCESS_KEY"); if (awsAccessKeyId != null && awsSecretAccessKey != null) { conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId); conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey); conf.set("fs.s3n.awsAccessKeyId", awsAccessKeyId); conf.set("fs.s3n.awsSecretAccessKey", awsSecretAccessKey); } // Add a new custom filesystem mapping conf.set("fs.sftp.impl", "gobblin.source.extractor.extract.sftp.SftpLightWeightFileSystem"); conf.set("fs.sftp.impl.disable.cache", "true"); return conf; } /** * @deprecated Use {@link FileListUtils#listFilesRecursively(FileSystem, Path)}. */ @Deprecated public static List<FileStatus> listStatusRecursive(FileSystem fileSystem, Path path) throws IOException { List<FileStatus> results = Lists.newArrayList(); walk(results, fileSystem, path); return results; } /** * Get the path as a string without schema or authority. * * E.g. Converts sftp://user/data/file.txt to /user/data/file.txt */ public static String toUriPath(Path path) { return path.toUri().getPath(); } /** * A wrapper around {@link FileSystem#delete(Path, boolean)} which throws {@link IOException} if the given * {@link Path} exists, and {@link FileSystem#delete(Path, boolean)} returns False. */ public static void deletePath(FileSystem fs, Path f, boolean recursive) throws IOException { if (fs.exists(f) && !fs.delete(f, recursive)) { throw new IOException("Failed to delete: " + f); } } /** * A wrapper around {@link FileSystem#delete(Path, boolean)} that only deletes a given {@link Path} if it is present * on the given {@link FileSystem}. */ public static void deleteIfExists(FileSystem fs, Path path, boolean recursive) throws IOException { if (fs.exists(path)) { deletePath(fs, path, recursive); } } public static void deletePathAndEmptyAncestors(FileSystem fs, Path f, boolean recursive) throws IOException { deletePath(fs, f, recursive); Path parent = f.getParent(); while (parent != null) { if (fs.exists(parent) && fs.listStatus(parent).length == 0) { deletePath(fs, parent, true); parent = parent.getParent(); } else { break; } } } /** * Renames a src {@link Path} on fs {@link FileSystem} to a dst {@link Path}. If fs is a {@link LocalFileSystem} and * src is a directory then {@link File#renameTo} is called directly to avoid a directory rename race condition where * {@link org.apache.hadoop.fs.RawLocalFileSystem#rename} copies the conflicting src directory into dst resulting in * an extra nested level, such as /root/a/b/c/e/e where e is repeated. * * @param fs the {@link FileSystem} where the src {@link Path} exists * @param src the source {@link Path} which will be renamed * @param dst the {@link Path} to rename to * @return true if rename succeeded, false if rename failed. * @throws IOException if rename failed for reasons other than target exists. */ public static boolean renamePathHandleLocalFSRace(FileSystem fs, Path src, Path dst) throws IOException { if (DecoratorUtils.resolveUnderlyingObject(fs) instanceof LocalFileSystem && fs.isDirectory(src)) { LocalFileSystem localFs = (LocalFileSystem) DecoratorUtils.resolveUnderlyingObject(fs); File srcFile = localFs.pathToFile(src); File dstFile = localFs.pathToFile(dst); return srcFile.renameTo(dstFile); } else { return fs.rename(src, dst); } } /** * A wrapper around {@link FileSystem#rename(Path, Path)} which throws {@link IOException} if * {@link FileSystem#rename(Path, Path)} returns False. */ public static void renamePath(FileSystem fs, Path oldName, Path newName) throws IOException { renamePath(fs, oldName, newName, false); } /** * A wrapper around {@link FileSystem#rename(Path, Path)} which throws {@link IOException} if * {@link FileSystem#rename(Path, Path)} returns False. */ public static void renamePath(FileSystem fs, Path oldName, Path newName, boolean overwrite) throws IOException { if (!fs.exists(oldName)) { throw new FileNotFoundException(String.format("Failed to rename %s to %s: src not found", oldName, newName)); } if (fs.exists(newName)) { if (overwrite) { if (!fs.delete(newName, true)) { throw new IOException( String.format("Failed to delete %s while renaming %s to %s", newName, oldName, newName)); } } else { throw new FileAlreadyExistsException( String.format("Failed to rename %s to %s: dst already exists", oldName, newName)); } } if (!fs.rename(oldName, newName)) { throw new IOException(String.format("Failed to rename %s to %s", oldName, newName)); } } /** * Moves a src {@link Path} from a srcFs {@link FileSystem} to a dst {@link Path} on a dstFs {@link FileSystem}. If * the srcFs and the dstFs have the same scheme, and neither of them or S3 schemes, then the {@link Path} is simply * renamed. Otherwise, the data is from the src {@link Path} to the dst {@link Path}. So this method can handle copying * data between different {@link FileSystem} implementations. * * @param srcFs the source {@link FileSystem} where the src {@link Path} exists * @param src the source {@link Path} which will me moved * @param dstFs the destination {@link FileSystem} where the dst {@link Path} should be created * @param dst the {@link Path} to move data to */ public static void movePath(FileSystem srcFs, Path src, FileSystem dstFs, Path dst, Configuration conf) throws IOException { movePath(srcFs, src, dstFs, dst, false, conf); } /** * Moves a src {@link Path} from a srcFs {@link FileSystem} to a dst {@link Path} on a dstFs {@link FileSystem}. If * the srcFs and the dstFs have the same scheme, and neither of them or S3 schemes, then the {@link Path} is simply * renamed. Otherwise, the data is from the src {@link Path} to the dst {@link Path}. So this method can handle copying * data between different {@link FileSystem} implementations. * * @param srcFs the source {@link FileSystem} where the src {@link Path} exists * @param src the source {@link Path} which will me moved * @param dstFs the destination {@link FileSystem} where the dst {@link Path} should be created * @param dst the {@link Path} to move data to * @param overwrite true if the destination should be overwritten; otherwise, false */ public static void movePath(FileSystem srcFs, Path src, FileSystem dstFs, Path dst, boolean overwrite, Configuration conf) throws IOException { if (srcFs.getUri().getScheme().equals(dstFs.getUri().getScheme()) && !FS_SCHEMES_NON_ATOMIC.contains(srcFs.getUri().getScheme()) && !FS_SCHEMES_NON_ATOMIC.contains(dstFs.getUri().getScheme())) { renamePath(srcFs, src, dst); } else { copyPath(srcFs, src, dstFs, dst, true, overwrite, conf); } } /** * Copies data from a src {@link Path} to a dst {@link Path}. * * <p> * This method should be used in preference to * {@link FileUtil#copy(FileSystem, Path, FileSystem, Path, boolean, boolean, Configuration)}, which does not handle * clean up of incomplete files if there is an error while copying data. * </p> * * <p> * TODO this method does not handle cleaning up any local files leftover by writing to S3. * </p> * * @param srcFs the source {@link FileSystem} where the src {@link Path} exists * @param src the {@link Path} to copy from the source {@link FileSystem} * @param dstFs the destination {@link FileSystem} where the dst {@link Path} should be created * @param dst the {@link Path} to copy data to */ public static void copyPath(FileSystem srcFs, Path src, FileSystem dstFs, Path dst, Configuration conf) throws IOException { copyPath(srcFs, src, dstFs, dst, false, false, conf); } /** * Copies data from a src {@link Path} to a dst {@link Path}. * * <p> * This method should be used in preference to * {@link FileUtil#copy(FileSystem, Path, FileSystem, Path, boolean, boolean, Configuration)}, which does not handle * clean up of incomplete files if there is an error while copying data. * </p> * * <p> * TODO this method does not handle cleaning up any local files leftover by writing to S3. * </p> * * @param srcFs the source {@link FileSystem} where the src {@link Path} exists * @param src the {@link Path} to copy from the source {@link FileSystem} * @param dstFs the destination {@link FileSystem} where the dst {@link Path} should be created * @param dst the {@link Path} to copy data to * @param overwrite true if the destination should be overwritten; otherwise, false */ public static void copyPath(FileSystem srcFs, Path src, FileSystem dstFs, Path dst, boolean overwrite, Configuration conf) throws IOException { copyPath(srcFs, src, dstFs, dst, false, overwrite, conf); } private static void copyPath(FileSystem srcFs, Path src, FileSystem dstFs, Path dst, boolean deleteSource, boolean overwrite, Configuration conf) throws IOException { Preconditions.checkArgument(srcFs.exists(src), String.format("Cannot copy from %s to %s because src does not exist", src, dst)); Preconditions.checkArgument(overwrite || !dstFs.exists(dst), String.format("Cannot copy from %s to %s because dst exists", src, dst)); try { boolean isSourceFileSystemLocal = srcFs instanceof LocalFileSystem || srcFs instanceof RawLocalFileSystem; if (isSourceFileSystemLocal) { try { dstFs.copyFromLocalFile(deleteSource, overwrite, src, dst); } catch (IOException e) { throw new IOException(String.format("Failed to copy %s to %s", src, dst), e); } } else if (!FileUtil.copy(srcFs, src, dstFs, dst, deleteSource, overwrite, conf)) { throw new IOException(String.format("Failed to copy %s to %s", src, dst)); } } catch (Throwable t1) { try { deleteIfExists(dstFs, dst, true); } catch (Throwable t2) { // Do nothing } throw t1; } } /** * Copies a src {@link Path} from a srcFs {@link FileSystem} to a dst {@link Path} on a dstFs {@link FileSystem}. If * either the srcFs or dstFs are S3 {@link FileSystem}s (as dictated by {@link #FS_SCHEMES_NON_ATOMIC}) then data is directly * copied from the src to the dst. Otherwise data is first copied to a tmp {@link Path}, which is then renamed to the * dst. * * @param srcFs the source {@link FileSystem} where the src {@link Path} exists * @param src the {@link Path} to copy from the source {@link FileSystem} * @param dstFs the destination {@link FileSystem} where the dst {@link Path} should be created * @param dst the {@link Path} to copy data to * @param tmp the temporary {@link Path} to use when copying data * @param overwriteDst true if the destination and tmp path should should be overwritten, false otherwise */ public static void copyFile(FileSystem srcFs, Path src, FileSystem dstFs, Path dst, Path tmp, boolean overwriteDst, Configuration conf) throws IOException { Preconditions.checkArgument(srcFs.isFile(src), String.format("Cannot copy from %s to %s because src is not a file", src, dst)); if (FS_SCHEMES_NON_ATOMIC.contains(srcFs.getUri().getScheme()) || FS_SCHEMES_NON_ATOMIC.contains(dstFs.getUri().getScheme())) { copyFile(srcFs, src, dstFs, dst, overwriteDst, conf); } else { copyFile(srcFs, src, dstFs, tmp, overwriteDst, conf); try { boolean renamed = false; if (overwriteDst && dstFs.exists(dst)) { try { deletePath(dstFs, dst, true); } finally { renamePath(dstFs, tmp, dst); renamed = true; } } if (!renamed) { renamePath(dstFs, tmp, dst); } } finally { deletePath(dstFs, tmp, true); } } } /** * Copy a file from a srcFs {@link FileSystem} to a dstFs {@link FileSystem}. The src {@link Path} must be a file, * that is {@link FileSystem#isFile(Path)} must return true for src. * * <p> * If overwrite is specified to true, this method may delete the dst directory even if the copy from src to dst fails. * </p> * * @param srcFs the src {@link FileSystem} to copy the file from * @param src the src {@link Path} to copy * @param dstFs the destination {@link FileSystem} to write to * @param dst the destination {@link Path} to write to * @param overwrite true if the dst {@link Path} should be overwritten, false otherwise */ public static void copyFile(FileSystem srcFs, Path src, FileSystem dstFs, Path dst, boolean overwrite, Configuration conf) throws IOException { Preconditions.checkArgument(srcFs.isFile(src), String.format("Cannot copy from %s to %s because src is not a file", src, dst)); Preconditions.checkArgument(overwrite || !dstFs.exists(dst), String.format("Cannot copy from %s to %s because dst exists", src, dst)); try (InputStream in = srcFs.open(src); OutputStream out = dstFs.create(dst, overwrite)) { IOUtils.copyBytes(in, out, conf, false); } catch (Throwable t1) { try { deleteIfExists(dstFs, dst, true); } catch (Throwable t2) { // Do nothing } throw t1; } } private static void walk(List<FileStatus> results, FileSystem fileSystem, Path path) throws IOException { for (FileStatus status : fileSystem.listStatus(path)) { if (!status.isDirectory()) { results.add(status); } else { walk(results, fileSystem, status.getPath()); } } } /** * This method is an additive implementation of the {@link FileSystem#rename(Path, Path)} method. It moves all the * files/directories under 'from' path to the 'to' path without overwriting existing directories in the 'to' path. * * <p> * The rename operation happens at the first non-existent sub-directory. If a directory at destination path already * exists, it recursively tries to move sub-directories. If all the sub-directories also exist at the destination, * a file level move is done * </p> * * @param fileSystem on which the data needs to be moved * @param from path of the data to be moved * @param to path of the data to be moved */ public static void renameRecursively(FileSystem fileSystem, Path from, Path to) throws IOException { log.info(String.format("Recursively renaming %s in %s to %s.", from, fileSystem.getUri(), to)); FileSystem throttledFS = getOptionallyThrottledFileSystem(fileSystem, 10000); ExecutorService executorService = ScalingThreadPoolExecutor.newScalingThreadPool(1, 100, 100, ExecutorsUtils.newThreadFactory(Optional.of(log), Optional.of("rename-thread-%d"))); Queue<Future<?>> futures = Queues.newConcurrentLinkedQueue(); try { if (!fileSystem.exists(from)) { throw new IOException("Trying to rename a path that does not exist! " + from); } futures.add(executorService .submit(new RenameRecursively(throttledFS, fileSystem.getFileStatus(from), to, executorService, futures))); int futuresUsed = 0; while (!futures.isEmpty()) { try { futures.poll().get(); futuresUsed++; } catch (ExecutionException | InterruptedException ee) { throw new IOException(ee.getCause()); } } log.info(String.format("Recursive renaming of %s to %s. (details: used %d futures)", from, to, futuresUsed)); } finally { ExecutorsUtils.shutdownExecutorService(executorService, Optional.of(log), 1, TimeUnit.SECONDS); } } /** * Calls {@link #getOptionallyThrottledFileSystem(FileSystem, int)} parsing the qps from the input {@link State} * at key {@link #MAX_FILESYSTEM_QPS}. * @throws IOException */ public static FileSystem getOptionallyThrottledFileSystem(FileSystem fs, State state) throws IOException { DeprecationUtils.renameDeprecatedKeys(state, MAX_FILESYSTEM_QPS, DEPRECATED_KEYS); if (state.contains(MAX_FILESYSTEM_QPS)) { return getOptionallyThrottledFileSystem(fs, state.getPropAsInt(MAX_FILESYSTEM_QPS)); } return fs; } /** * Get a throttled {@link FileSystem} that limits the number of queries per second to a {@link FileSystem}. If * the input qps is <= 0, no such throttling will be performed. * @throws IOException */ public static FileSystem getOptionallyThrottledFileSystem(FileSystem fs, int qpsLimit) throws IOException { if (fs instanceof Decorator) { for (Object obj : DecoratorUtils.getDecoratorLineage(fs)) { if (obj instanceof RateControlledFileSystem) { // Already rate controlled return fs; } } } if (qpsLimit > 0) { try { RateControlledFileSystem newFS = new RateControlledFileSystem(fs, qpsLimit); newFS.startRateControl(); return newFS; } catch (ExecutionException ee) { throw new IOException("Could not create throttled FileSystem.", ee); } } return fs; } @AllArgsConstructor private static class RenameRecursively implements Runnable { private final FileSystem fileSystem; private final FileStatus from; private final Path to; private final ExecutorService executorService; private final Queue<Future<?>> futures; @Override public void run() { try { // Attempt to move safely if directory, unsafely if file (for performance, files are much less likely to collide on target) boolean moveSucessful = this.from.isDirectory() ? safeRenameIfNotExists(this.fileSystem, this.from.getPath(), this.to) : unsafeRenameIfNotExists(this.fileSystem, this.from.getPath(), this.to); if (!moveSucessful) { if (this.from.isDirectory()) { for (FileStatus fromFile : this.fileSystem.listStatus(this.from.getPath())) { Path relativeFilePath = new Path(StringUtils.substringAfter(fromFile.getPath().toString(), this.from.getPath().toString() + Path.SEPARATOR)); Path toFilePath = new Path(this.to, relativeFilePath); this.futures.add(this.executorService.submit( new RenameRecursively(this.fileSystem, fromFile, toFilePath, this.executorService, this.futures))); } } else { log.info(String.format("File already exists %s. Will not rewrite", this.to)); } } } catch (IOException ioe) { throw new RuntimeException(ioe); } } } /** * Renames from to to if to doesn't exist in a thread-safe way. This method is necessary because * {@link FileSystem#rename} is inconsistent across file system implementations, e.g. in some of them rename(foo, bar) * will create bar/foo if bar already existed, but it will only create bar if it didn't. * * <p> * The thread-safety is only guaranteed among calls to this method. An external modification to the relevant * target directory could still cause unexpected results in the renaming. * </p> * * @param fs filesystem where rename will be executed. * @param from origin {@link Path}. * @param to target {@link Path}. * @return true if rename succeeded, false if the target already exists. * @throws IOException if rename failed for reasons other than target exists. */ public synchronized static boolean safeRenameIfNotExists(FileSystem fs, Path from, Path to) throws IOException { return unsafeRenameIfNotExists(fs, from, to); } /** * Renames from to to if to doesn't exist in a non-thread-safe way. * * @param fs filesystem where rename will be executed. * @param from origin {@link Path}. * @param to target {@link Path}. * @return true if rename succeeded, false if the target already exists. * @throws IOException if rename failed for reasons other than target exists. */ public static boolean unsafeRenameIfNotExists(FileSystem fs, Path from, Path to) throws IOException { if (!fs.exists(to)) { if (!fs.exists(to.getParent())) { fs.mkdirs(to.getParent()); } if (!renamePathHandleLocalFSRace(fs, from, to)) { if (!fs.exists(to)) { throw new IOException(String.format("Failed to rename %s to %s.", from, to)); } return false; } return true; } return false; } /** * A thread safe variation of {@link #renamePath(FileSystem, Path, Path)} which can be used in * multi-threaded/multi-mapper environment. The rename operation always happens at file level hence directories are * not overwritten under the 'to' path. * * <p> * If the contents of destination 'to' path is not expected to be modified concurrently, use * {@link #renamePath(FileSystem, Path, Path)} which is faster and more optimized * </p> * * <b>NOTE: This does not seem to be working for all {@link FileSystem} implementations. Use * {@link #renameRecursively(FileSystem, Path, Path)}</b> * * @param fileSystem on which the data needs to be moved * @param from path of the data to be moved * @param to path of the data to be moved * */ public static void safeRenameRecursively(FileSystem fileSystem, Path from, Path to) throws IOException { for (FileStatus fromFile : FileListUtils.listFilesRecursively(fileSystem, from)) { Path relativeFilePath = new Path(StringUtils.substringAfter(fromFile.getPath().toString(), from.toString() + Path.SEPARATOR)); Path toFilePath = new Path(to, relativeFilePath); if (!fileSystem.exists(toFilePath)) { boolean renamed = false; // underlying file open can fail with file not found error due to some race condition // when the parent directory is created in another thread, so retry a few times for (int i = 0; !renamed && i < MAX_RENAME_TRIES; i++) { try { renamed = fileSystem.rename(fromFile.getPath(), toFilePath); break; } catch (FileNotFoundException e) { if (i + 1 >= MAX_RENAME_TRIES) { throw e; } } } if (!renamed) { throw new IOException(String.format("Failed to rename %s to %s.", fromFile.getPath(), toFilePath)); } log.info(String.format("Renamed %s to %s", fromFile.getPath(), toFilePath)); } else { log.info(String.format("File already exists %s. Will not rewrite", toFilePath)); } } } public static Configuration getConfFromState(State state) { return getConfFromState(state, Optional.<String> absent()); } /** * Provides Hadoop configuration given state. * It also supports decrypting values on "encryptedPath". * Note that this encryptedPath path will be removed from full path of each config key and leaving only child path on the key(s). * If there's same config path as child path, the one stripped will have higher priority. * * e.g: * - encryptedPath: writer.fs.encrypted * before: writer.fs.encrypted.secret * after: secret * * Common use case for these encryptedPath: * When there's have encrypted credential in job property but you'd like Filesystem to get decrypted value. * * @param srcConfig source config. * @param encryptedPath Optional. If provided, config that is on this path will be decrypted. @see ConfigUtils.resolveEncrypted * Note that config on encryptedPath will be included in the end result even it's not part of includeOnlyPath * @return Hadoop Configuration. */ public static Configuration getConfFromState(State state, Optional<String> encryptedPath) { Config config = ConfigFactory.parseProperties(state.getProperties()); if (encryptedPath.isPresent()) { config = ConfigUtils.resolveEncrypted(config, encryptedPath); } Configuration conf = newConfiguration(); for (Entry<String, ConfigValue> entry : config.entrySet()) { conf.set(entry.getKey(), entry.getValue().unwrapped().toString()); } return conf; } public static Configuration getConfFromProperties(Properties properties) { Configuration conf = newConfiguration(); for (String propName : properties.stringPropertyNames()) { conf.set(propName, properties.getProperty(propName)); } return conf; } public static State getStateFromConf(Configuration conf) { State state = new State(); for (Entry<String, String> entry : conf) { state.setProp(entry.getKey(), entry.getValue()); } return state; } /** * Set the group associated with a given path. * * @param fs the {@link FileSystem} instance used to perform the file operation * @param path the given path * @param group the group associated with the path * @throws IOException */ public static void setGroup(FileSystem fs, Path path, String group) throws IOException { fs.setOwner(path, fs.getFileStatus(path).getOwner(), group); } /** * Serialize a {@link Writable} object into a string. * * @param writable the {@link Writable} object to be serialized * @return a string serialized from the {@link Writable} object * @throws IOException if there's something wrong with the serialization */ public static String serializeToString(Writable writable) throws IOException { try (ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(); DataOutputStream dataOutputStream = new DataOutputStream(byteArrayOutputStream)) { writable.write(dataOutputStream); return BaseEncoding.base64().encode(byteArrayOutputStream.toByteArray()); } } /** * Deserialize a {@link Writable} object from a string. * * @param writableClass the {@link Writable} implementation class * @param serializedWritableStr the string containing a serialized {@link Writable} object * @return a {@link Writable} deserialized from the string * @throws IOException if there's something wrong with the deserialization */ public static Writable deserializeFromString(Class<? extends Writable> writableClass, String serializedWritableStr) throws IOException { return deserializeFromString(writableClass, serializedWritableStr, new Configuration()); } /** * Deserialize a {@link Writable} object from a string. * * @param writableClass the {@link Writable} implementation class * @param serializedWritableStr the string containing a serialized {@link Writable} object * @param configuration a {@link Configuration} object containing Hadoop configuration properties * @return a {@link Writable} deserialized from the string * @throws IOException if there's something wrong with the deserialization */ public static Writable deserializeFromString(Class<? extends Writable> writableClass, String serializedWritableStr, Configuration configuration) throws IOException { byte[] writableBytes = BaseEncoding.base64().decode(serializedWritableStr); try (ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream(writableBytes); DataInputStream dataInputStream = new DataInputStream(byteArrayInputStream)) { Writable writable = ReflectionUtils.newInstance(writableClass, configuration); writable.readFields(dataInputStream); return writable; } } /** * Given a {@link FsPermission} objects, set a key, value pair in the given {@link State} for the writer to * use when creating files. This method should be used in conjunction with {@link #deserializeWriterFilePermissions(State, int, int)}. */ public static void serializeWriterFilePermissions(State state, int numBranches, int branchId, FsPermission fsPermissions) { serializeFsPermissions(state, ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_FILE_PERMISSIONS, numBranches, branchId), fsPermissions); } /** * Given a {@link FsPermission} objects, set a key, value pair in the given {@link State} for the writer to * use when creating files. This method should be used in conjunction with {@link #deserializeWriterDirPermissions(State, int, int)}. */ public static void serializeWriterDirPermissions(State state, int numBranches, int branchId, FsPermission fsPermissions) { serializeFsPermissions(state, ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_DIR_PERMISSIONS, numBranches, branchId), fsPermissions); } /** * Helper method that serializes a {@link FsPermission} object. */ private static void serializeFsPermissions(State state, String key, FsPermission fsPermissions) { state.setProp(key, String.format("%04o", fsPermissions.toShort())); } /** * Given a {@link String} in octal notation, set a key, value pair in the given {@link State} for the writer to * use when creating files. This method should be used in conjunction with {@link #deserializeWriterFilePermissions(State, int, int)}. */ public static void setWriterFileOctalPermissions(State state, int numBranches, int branchId, String octalPermissions) { state.setProp( ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_FILE_PERMISSIONS, numBranches, branchId), octalPermissions); } /** * Given a {@link String} in octal notation, set a key, value pair in the given {@link State} for the writer to * use when creating directories. This method should be used in conjunction with {@link #deserializeWriterDirPermissions(State, int, int)}. */ public static void setWriterDirOctalPermissions(State state, int numBranches, int branchId, String octalPermissions) { state.setProp( ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_DIR_PERMISSIONS, numBranches, branchId), octalPermissions); } /** * Deserializes a {@link FsPermission}s object that should be used when a {@link DataWriter} is writing a file. */ public static FsPermission deserializeWriterFilePermissions(State state, int numBranches, int branchId) { return new FsPermission(state.getPropAsShortWithRadix( ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_FILE_PERMISSIONS, numBranches, branchId), FsPermission.getDefault().toShort(), ConfigurationKeys.PERMISSION_PARSING_RADIX)); } /** * Deserializes a {@link FsPermission}s object that should be used when a {@link DataWriter} is creating directories. */ public static FsPermission deserializeWriterDirPermissions(State state, int numBranches, int branchId) { return new FsPermission(state.getPropAsShortWithRadix( ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_DIR_PERMISSIONS, numBranches, branchId), FsPermission.getDefault().toShort(), ConfigurationKeys.PERMISSION_PARSING_RADIX)); } /** * Get {@link FsPermission} from a {@link State} object. * * @param props A {@link State} containing properties. * @param propName The property name for the permission. If not contained in the given state, * defaultPermission will be used. * @param defaultPermission default permission if propName is not contained in props. * @return An {@link FsPermission} object. */ public static FsPermission deserializeFsPermission(State props, String propName, FsPermission defaultPermission) { short mode = props.getPropAsShortWithRadix(propName, defaultPermission.toShort(), ConfigurationKeys.PERMISSION_PARSING_RADIX); return new FsPermission(mode); } /** * Remove illegal HDFS path characters from the given path. Illegal characters will be replaced * with the given substitute. */ public static String sanitizePath(String path, String substitute) { Preconditions.checkArgument(substitute.replaceAll(HDFS_ILLEGAL_TOKEN_REGEX, "").equals(substitute), "substitute contains illegal characters: " + substitute); return path.replaceAll(HDFS_ILLEGAL_TOKEN_REGEX, substitute); } /** * Remove illegal HDFS path characters from the given path. Illegal characters will be replaced * with the given substitute. */ public static Path sanitizePath(Path path, String substitute) { return new Path(sanitizePath(path.toString(), substitute)); } /** * Try to set owner and permissions for the path. Will not throw exception. */ public static void setPermissions(Path location, Optional<String> owner, Optional<String> group, FileSystem fs, FsPermission permission) { try { if (!owner.isPresent()) { return; } if (!group.isPresent()) { return; } fs.setOwner(location, owner.get(), group.get()); fs.setPermission(location, permission); if (!fs.isDirectory(location)) { return; } for (FileStatus fileStatus : fs.listStatus(location)) { setPermissions(fileStatus.getPath(), owner, group, fs, permission); } } catch (IOException e) { log.warn("Exception occurred while trying to change permissions : " + e.getMessage()); } } public static boolean hasContent(FileSystem fs, Path path) throws IOException { if (!fs.isDirectory(path)) { return true; } boolean content = false; for (FileStatus fileStatus : fs.listStatus(path)) { content = content || hasContent(fs, fileStatus.getPath()); if (content) { break; } } return content; } /** * Add "gobblin-site.xml" as a {@link Configuration} resource. */ public static void addGobblinSite() { Configuration.addDefaultResource("gobblin-site.xml"); } }