/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package gobblin.util;

import java.io.IOException;
import java.net.URI;
import java.util.concurrent.ExecutionException;

import org.apache.avro.file.CodecFactory;
import org.apache.avro.file.DataFileConstants;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.security.token.Token;

import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.base.Strings;

import lombok.extern.slf4j.Slf4j;

import gobblin.configuration.ConfigurationKeys;
import gobblin.configuration.State;
import gobblin.configuration.WorkUnitState;
import gobblin.source.workunit.WorkUnit;


/**
 * Utility class for use with the {@link gobblin.writer.DataWriter} class.
 */
@Slf4j
public class WriterUtils {

  public static final String WRITER_ENCRYPTED_CONFIG_PATH = ConfigurationKeys.WRITER_PREFIX + ".encrypted";

  /**
   * TABLENAME should be used for jobs that pull from multiple tables/topics and intend to write the records
   * in each table/topic to a separate folder. Otherwise, DEFAULT can be used.
   */
  public enum WriterFilePathType {
    TABLENAME,
    DEFAULT
  }

  /**
   * Get the {@link Path} corresponding to the directory where a given {@link gobblin.writer.DataWriter} should be
   * writing its staging data. The staging data directory is determined by combining the
   * {@link ConfigurationKeys#WRITER_STAGING_DIR} and the {@link ConfigurationKeys#WRITER_FILE_PATH}.
   * @param state is the {@link State} corresponding to a specific {@link gobblin.writer.DataWriter}.
   * @param numBranches is the total number of branches for the given {@link State}.
   * @param branchId is the id for the specific branch that the {@link gobblin.writer.DataWriter} will write to.
   * @return a {@link Path} specifying the directory where the {@link gobblin.writer.DataWriter} will write to.
   */
  public static Path getWriterStagingDir(State state, int numBranches, int branchId) {
    String writerStagingDirKey =
        ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_STAGING_DIR, numBranches, branchId);
    Preconditions.checkArgument(state.contains(writerStagingDirKey),
        "Missing required property " + writerStagingDirKey);

    return new Path(
        state.getProp(
            ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_STAGING_DIR, numBranches, branchId)),
        WriterUtils.getWriterFilePath(state, numBranches, branchId));
  }

  /**
   * Get the staging {@link Path} for a {@link gobblin.writer.DataWriter} that has the attemptId in the path.
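   * The attempt-specific staging directory is the writer staging directory returned by
   * {@link #getWriterStagingDir(State, int, int)} with the attemptId appended as its last path component.
   * @param state is the {@link State} corresponding to a specific {@link gobblin.writer.DataWriter}.
   * @param numBranches is the total number of branches for the given {@link State}.
   * @param branchId is the id for the specific branch that the {@link gobblin.writer.DataWriter} will write to.
   * @param attemptId is the id of the current writer attempt; it must not be null or empty.
   * @return a {@link Path} specifying the attempt-specific staging directory.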
   */
  public static Path getWriterStagingDir(State state, int numBranches, int branchId, String attemptId) {
    Preconditions.checkArgument(attemptId != null && !attemptId.isEmpty(),
        "AttemptId cannot be null or empty: " + attemptId);
    return new Path(getWriterStagingDir(state, numBranches, branchId), attemptId);
  }

  /**
   * Get the {@link Path} corresponding to the directory where a given {@link gobblin.writer.DataWriter} should be
   * writing its output data. The output data directory is determined by combining the
   * {@link ConfigurationKeys#WRITER_OUTPUT_DIR} and the {@link ConfigurationKeys#WRITER_FILE_PATH}.
   * @param state is the {@link State} corresponding to a specific {@link gobblin.writer.DataWriter}.
   * @param numBranches is the total number of branches for the given {@link State}.
   * @param branchId is the id for the specific branch that the {@link gobblin.writer.DataWriter} will write to.
   * @return a {@link Path} specifying the directory where the {@link gobblin.writer.DataWriter} will write to.
   */
  public static Path getWriterOutputDir(State state, int numBranches, int branchId) {
    String writerOutputDirKey =
        ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_OUTPUT_DIR, numBranches, branchId);
    Preconditions.checkArgument(state.contains(writerOutputDirKey),
        "Missing required property " + writerOutputDirKey);

    return new Path(state.getProp(writerOutputDirKey), WriterUtils.getWriterFilePath(state, numBranches, branchId));
  }

  /**
   * Get the {@link Path} corresponding to the directory where a given {@link gobblin.publisher.BaseDataPublisher}
   * should commit its output data. The final output data directory is determined by combining the
   * {@link ConfigurationKeys#DATA_PUBLISHER_FINAL_DIR} and the {@link ConfigurationKeys#WRITER_FILE_PATH}.
   * @param state is the {@link State} corresponding to a specific {@link gobblin.writer.DataWriter}.
   * @param numBranches is the total number of branches for the given {@link State}.
   * @param branchId is the id for the specific branch that the {@link gobblin.publisher.BaseDataPublisher} will publish.
   * @return a {@link Path} specifying the directory where the {@link gobblin.publisher.BaseDataPublisher} will publish.
   */
  public static Path getDataPublisherFinalDir(State state, int numBranches, int branchId) {
    String dataPublisherFinalDirKey =
        ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, numBranches, branchId);
    Preconditions.checkArgument(state.contains(dataPublisherFinalDirKey),
        "Missing required property " + dataPublisherFinalDirKey);

    if (state.getPropAsBoolean(ConfigurationKeys.DATA_PUBLISHER_APPEND_EXTRACT_TO_FINAL_DIR,
        ConfigurationKeys.DEFAULT_DATA_PUBLISHER_APPEND_EXTRACT_TO_FINAL_DIR)) {
      return new Path(
          state.getProp(
              ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, numBranches,
                  branchId)),
          WriterUtils.getWriterFilePath(state, numBranches, branchId));
    } else {
      return new Path(
          state.getProp(
              ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, numBranches,
                  branchId)));
    }
  }

  /**
   * Get the {@link Path} corresponding to the relative file path for a given {@link gobblin.writer.DataWriter}.
   * This method retrieves the value of {@link ConfigurationKeys#WRITER_FILE_PATH} from the given {@link State}. It
   * also constructs the default value of the {@link ConfigurationKeys#WRITER_FILE_PATH} if it is not specified in
   * the given {@link State}.
   * @param state is the {@link State} corresponding to a specific {@link gobblin.writer.DataWriter}.
   * @param numBranches is the total number of branches for the given {@link State}.
   * @param branchId is the id for the specific branch that the {@link gobblin.writer.DataWriter} will write to.
   * @return a {@link Path} specifying the relative directory where the {@link gobblin.writer.DataWriter} will write to.
   */
  public static Path getWriterFilePath(State state, int numBranches, int branchId) {
    if (state.contains(
        ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_FILE_PATH, numBranches, branchId))) {
      return new Path(state.getProp(
          ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_FILE_PATH, numBranches, branchId)));
    }

    switch (getWriterFilePathType(state)) {
      case TABLENAME:
        return WriterUtils.getTableNameWriterFilePath(state);
      default:
        return WriterUtils.getDefaultWriterFilePath(state, numBranches, branchId);
    }
  }

  private static WriterFilePathType getWriterFilePathType(State state) {
    String pathTypeStr =
        state.getProp(ConfigurationKeys.WRITER_FILE_PATH_TYPE, ConfigurationKeys.DEFAULT_WRITER_FILE_PATH_TYPE);
    return WriterFilePathType.valueOf(pathTypeStr.toUpperCase());
  }

  /**
   * Creates the {@link Path} for the {@link ConfigurationKeys#WRITER_FILE_PATH} key according to
   * {@link ConfigurationKeys#EXTRACT_TABLE_NAME_KEY}.
   * @param state is the {@link State} corresponding to a specific {@link gobblin.writer.DataWriter}.
   * @return a {@link Path} built from the extract table name.
   */
  public static Path getTableNameWriterFilePath(State state) {
    Preconditions.checkArgument(state.contains(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY));
    return new Path(state.getProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY));
  }

  /**
   * Creates the default {@link Path} for the {@link ConfigurationKeys#WRITER_FILE_PATH} key.
   * @param state is the {@link State} corresponding to a specific {@link gobblin.writer.DataWriter}.
   * @param numBranches is the total number of branches for the given {@link State}.
   * @param branchId is the id for the specific branch that the {@link gobblin.writer.DataWriter} will write to.
   * @return a {@link Path} specifying the directory where the {@link gobblin.writer.DataWriter} will write to.
   */
  public static Path getDefaultWriterFilePath(State state, int numBranches, int branchId) {
    if (state instanceof WorkUnitState) {
      WorkUnitState workUnitState = (WorkUnitState) state;
      return new Path(ForkOperatorUtils.getPathForBranch(workUnitState,
          workUnitState.getExtract().getOutputFilePath(), numBranches, branchId));
    } else if (state instanceof WorkUnit) {
      WorkUnit workUnit = (WorkUnit) state;
      return new Path(ForkOperatorUtils.getPathForBranch(workUnit, workUnit.getExtract().getOutputFilePath(),
          numBranches, branchId));
    }

    throw new RuntimeException("In order to get the default value for " + ConfigurationKeys.WRITER_FILE_PATH
        + " the given state must be of type " + WorkUnitState.class.getName() + " or " + WorkUnit.class.getName());
  }

  /**
   * Get the value of {@link ConfigurationKeys#WRITER_FILE_NAME} for a given {@link gobblin.writer.DataWriter}. The
   * method also constructs the default value of the {@link ConfigurationKeys#WRITER_FILE_NAME} if it is not set in
   * the {@link State}.
   * @param state is the {@link State} corresponding to a specific {@link gobblin.writer.DataWriter}.
   * @param numBranches is the total number of branches for the given {@link State}.
   * @param branchId is the id for the specific branch that the {@link gobblin.writer.DataWriter} will write to.
   * @param writerId is the id for a specific {@link gobblin.writer.DataWriter}.
   * @param formatExtension is the format extension for the file (e.g. ".avro").
   * @return a {@link String} representation of the file name.
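   * If {@link ConfigurationKeys#WRITER_FILE_NAME} is not set for the branch, the default name is constructed as
   * {@code <DEFAULT_WRITER_FILE_BASE_NAME>.<writerId>.<formatExtension>}, omitting the extension part when
   * formatExtension is null or empty.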
   */
  public static String getWriterFileName(State state, int numBranches, int branchId, String writerId,
      String formatExtension) {
    String defaultFileName = Strings.isNullOrEmpty(formatExtension)
        ? String.format("%s.%s", ConfigurationKeys.DEFAULT_WRITER_FILE_BASE_NAME, writerId)
        : String.format("%s.%s.%s", ConfigurationKeys.DEFAULT_WRITER_FILE_BASE_NAME, writerId, formatExtension);
    return state.getProp(
        ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_FILE_NAME, numBranches, branchId),
        defaultFileName);
  }

  /**
   * Creates a {@link CodecFactory} based on the specified codec name and deflate level. If codecName is absent, then
   * a {@link CodecFactory#deflateCodec(int)} is returned. Otherwise the codecName is converted into a
   * {@link CodecFactory} via the {@link CodecFactory#fromString(String)} method.
   *
   * @param codecName the name of the codec to use (e.g. deflate, snappy, xz, etc.).
   * @param deflateLevel must be an integer from [0-9], and is only applicable if the codecName is "deflate".
   * @return a {@link CodecFactory}.
   */
  public static CodecFactory getCodecFactory(Optional<String> codecName, Optional<String> deflateLevel) {
    if (!codecName.isPresent()) {
      return CodecFactory.deflateCodec(ConfigurationKeys.DEFAULT_DEFLATE_LEVEL);
    } else if (codecName.get().equalsIgnoreCase(DataFileConstants.DEFLATE_CODEC)) {
      if (!deflateLevel.isPresent()) {
        return CodecFactory.deflateCodec(ConfigurationKeys.DEFAULT_DEFLATE_LEVEL);
      }
      return CodecFactory.deflateCodec(Integer.parseInt(deflateLevel.get()));
    } else {
      return CodecFactory.fromString(codecName.get().toLowerCase());
    }
  }

  /**
   * Create the given dir as well as all missing ancestor dirs. All created dirs will have the given permission.
   * This should be used instead of {@link FileSystem#mkdirs(Path, FsPermission)}, since that method only sets
   * the permission for the given dir, and not recursively for the ancestor dirs.
   *
   * @param fs FileSystem
   * @param path The dir to be created
   * @param perm The permission to be set
   * @throws IOException if unable to create the dir or set its permission.
   */
  public static void mkdirsWithRecursivePermission(FileSystem fs, Path path, FsPermission perm) throws IOException {
    if (fs.exists(path)) {
      return;
    }
    if (path.getParent() != null && !fs.exists(path.getParent())) {
      mkdirsWithRecursivePermission(fs, path.getParent(), perm);
    }
    if (!fs.mkdirs(path, perm)) {
      throw new IOException(String.format("Unable to mkdir %s with permission %s", path, perm));
    }

    // Double check permission, since fs.mkdirs() may not guarantee to set the permission correctly
    if (!fs.getFileStatus(path).getPermission().equals(perm)) {
      fs.setPermission(path, perm);
    }
  }

  /**
   * Creates the {@link FileSystem} that a {@link gobblin.writer.DataWriter} should write to, based on
   * {@link ConfigurationKeys#WRITER_FILE_SYSTEM_URI}. If {@link ConfigurationKeys#SHOULD_FS_PROXY_AS_USER} is set,
   * the returned {@link FileSystem} proxies as another user; otherwise it is created as the current user.
   */
  public static FileSystem getWriterFS(State state, int numBranches, int branchId) throws IOException {
    URI uri = URI.create(state.getProp(
        ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_FILE_SYSTEM_URI, numBranches, branchId),
        ConfigurationKeys.LOCAL_FS_URI));

    Configuration hadoopConf = getFsConfiguration(state);
    if (state.getPropAsBoolean(ConfigurationKeys.SHOULD_FS_PROXY_AS_USER,
        ConfigurationKeys.DEFAULT_SHOULD_FS_PROXY_AS_USER)) {
      // Initialize file system for a proxy user.
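      // The proxy user is authenticated according to FS_PROXY_AUTH_METHOD: with TOKEN_AUTH a delegation token for
      // the proxy user is read from the token file configured via FS_PROXY_AS_USER_TOKEN_FILE, and with
      // KERBEROS_AUTH a super user is logged in from a keytab and proxies as the target user.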
      String authMethod =
          state.getProp(ConfigurationKeys.FS_PROXY_AUTH_METHOD, ConfigurationKeys.DEFAULT_FS_PROXY_AUTH_METHOD);
      if (authMethod.equalsIgnoreCase(ConfigurationKeys.TOKEN_AUTH)) {
        return getWriterFsUsingToken(state, uri);
      } else if (authMethod.equalsIgnoreCase(ConfigurationKeys.KERBEROS_AUTH)) {
        return getWriterFsUsingKeytab(state, uri);
      }
    }

    // Initialize file system as the current user.
    return FileSystem.get(uri, hadoopConf);
  }

  public static FileSystem getWriterFs(State state) throws IOException {
    return getWriterFS(state, 1, 0);
  }

  // Builds a proxied FileSystem using a delegation token read from the configured token file.
  private static FileSystem getWriterFsUsingToken(State state, URI uri) throws IOException {
    try {
      String user = state.getProp(ConfigurationKeys.FS_PROXY_AS_USER_NAME);
      Optional<Token<?>> token = ProxiedFileSystemUtils
          .getTokenFromSeqFile(user, new Path(state.getProp(ConfigurationKeys.FS_PROXY_AS_USER_TOKEN_FILE)));
      if (!token.isPresent()) {
        throw new IOException("No token found for user " + user);
      }
      return ProxiedFileSystemCache.fromToken().userNameToken(token.get())
          .userNameToProxyAs(state.getProp(ConfigurationKeys.FS_PROXY_AS_USER_NAME)).fsURI(uri)
          .conf(HadoopUtils.newConfiguration()).build();
    } catch (ExecutionException e) {
      throw new IOException(e);
    }
  }

  // Builds a proxied FileSystem by logging in the configured super user from a keytab and proxying as the target user.
  private static FileSystem getWriterFsUsingKeytab(State state, URI uri) throws IOException {
    FileSystem fs = FileSystem.newInstance(uri, new Configuration());
    try {
      Preconditions.checkArgument(state.contains(ConfigurationKeys.FS_PROXY_AS_USER_NAME),
          "Missing required property " + ConfigurationKeys.FS_PROXY_AS_USER_NAME);
      Preconditions.checkArgument(state.contains(ConfigurationKeys.SUPER_USER_NAME_TO_PROXY_AS_OTHERS),
          "Missing required property " + ConfigurationKeys.SUPER_USER_NAME_TO_PROXY_AS_OTHERS);
      Preconditions.checkArgument(state.contains(ConfigurationKeys.SUPER_USER_KEY_TAB_LOCATION),
          "Missing required property " + ConfigurationKeys.SUPER_USER_KEY_TAB_LOCATION);

      String user = state.getProp(ConfigurationKeys.FS_PROXY_AS_USER_NAME);
      String superUser = state.getProp(ConfigurationKeys.SUPER_USER_NAME_TO_PROXY_AS_OTHERS);
      Path keytabLocation = new Path(state.getProp(ConfigurationKeys.SUPER_USER_KEY_TAB_LOCATION));
      return ProxiedFileSystemCache.fromKeytab().userNameToProxyAs(user).fsURI(uri)
          .superUserKeytabLocation(keytabLocation).superUserName(superUser).conf(HadoopUtils.newConfiguration())
          .referenceFS(fs).build();
    } catch (ExecutionException e) {
      throw new IOException(e);
    }
  }

  // Builds the Hadoop Configuration for the writer FileSystem from the given State, using the writer's encrypted
  // config path.
  public static Configuration getFsConfiguration(State state) {
    return HadoopUtils.getConfFromState(state, Optional.of(WRITER_ENCRYPTED_CONFIG_PATH));
  }
}