/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package gobblin.data.management.copy.hive; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.metadata.Partition; import com.google.common.base.Optional; import com.google.common.base.Preconditions; import gobblin.util.PathUtils; public class HiveTargetPathHelper { /** * Specifies a root path for the data in a table. All files containing table data will be placed under this directory. * <p> * Does some token replacement in the input path. For example, if the table myTable is in DB myDatabase: * /data/$DB/$TABLE -> /data/myDatabase/myTable. * /data/$TABLE -> /data/myTable * /data -> /data/myTable * </p> * * See javadoc for {@link #getTargetPath} for further explanation. */ public static final String COPY_TARGET_TABLE_ROOT = HiveDatasetFinder.HIVE_DATASET_PREFIX + ".copy.target.table.root"; /** * These two options, in pair, specify the output location of the data files on copy * {@link #COPY_TARGET_TABLE_PREFIX_TOBE_REPLACED} specified the prefix of the path (without Scheme and Authority ) to be replaced * {@link #COPY_TARGET_TABLE_PREFIX_REPLACEMENT} specified the replacement of {@link #COPY_TARGET_TABLE_PREFIX_TOBE_REPLACED} * <p> * for example, if the data file is $sourceFs/data/databases/DB/Table/Snapshot/part-00000.avro , * {@link #COPY_TARGET_TABLE_PREFIX_TOBE_REPLACED} is /data/databases * {@link #COPY_TARGET_TABLE_PREFIX_REPLACEMENT} is /data/databases/_parallel * * then, the output location for that file will be * $targetFs/data/databases/_parallel/DB/Table/Snapshot/part-00000.avro * </p> */ public static final String COPY_TARGET_TABLE_PREFIX_TOBE_REPLACED = HiveDatasetFinder.HIVE_DATASET_PREFIX + ".copy.target.table.prefixToBeReplaced"; public static final String COPY_TARGET_TABLE_PREFIX_REPLACEMENT = HiveDatasetFinder.HIVE_DATASET_PREFIX + ".copy.target.table.prefixReplacement"; /** * Specifies that, on copy, data files for this table should all be relocated to a single directory per partition. * See javadoc for {@link #getTargetPath} for further explanation. */ public static final String RELOCATE_DATA_FILES_KEY = HiveDatasetFinder.HIVE_DATASET_PREFIX + ".copy.relocate.data.files"; public static final String DEFAULT_RELOCATE_DATA_FILES = Boolean.toString(false); private final boolean relocateDataFiles; private final Optional<Path> targetTableRoot; private final Optional<Path> targetTablePrefixTobeReplaced; private final Optional<Path> targetTablePrefixReplacement; private final HiveDataset dataset; public HiveTargetPathHelper(HiveDataset dataset) { this.dataset = dataset; this.relocateDataFiles = Boolean .valueOf(this.dataset.getProperties().getProperty(RELOCATE_DATA_FILES_KEY, DEFAULT_RELOCATE_DATA_FILES)); this.targetTableRoot = this.dataset.getProperties().containsKey(COPY_TARGET_TABLE_ROOT) ? Optional.of(resolvePath(this.dataset.getProperties().getProperty(COPY_TARGET_TABLE_ROOT), this.dataset.getTable().getDbName(), this.dataset.getTable().getTableName())) : Optional.<Path> absent(); this.targetTablePrefixTobeReplaced = this.dataset.getProperties().containsKey(COPY_TARGET_TABLE_PREFIX_TOBE_REPLACED) ? Optional.of(new Path(this.dataset.getProperties().getProperty(COPY_TARGET_TABLE_PREFIX_TOBE_REPLACED))) : Optional.<Path> absent(); this.targetTablePrefixReplacement = this.dataset.getProperties().containsKey(COPY_TARGET_TABLE_PREFIX_REPLACEMENT) ? Optional.of(new Path(this.dataset.getProperties().getProperty(COPY_TARGET_TABLE_PREFIX_REPLACEMENT))) : Optional.<Path> absent(); } private static Path addPartitionToPath(Path path, Partition partition) { for (String partitionValue : partition.getValues()) { path = new Path(path, partitionValue); } return path; } /** * Takes a path with tokens {@link #databaseToken} or {@link #tableToken} and replaces these tokens with the actual * database names and table name. For example, if db is myDatabase, table is myTable, then /data/$DB/$TABLE will be * resolved to /data/myDatabase/myTable. */ protected static Path resolvePath(String pattern, String database, String table) { pattern = pattern.replace(HiveDataset.DATABASE_TOKEN, database); if (pattern.contains(HiveDataset.TABLE_TOKEN)) { pattern = pattern.replace(HiveDataset.TABLE_TOKEN, table); return new Path(pattern); } else { return new Path(pattern, table); } } /** * Compute the target {@link Path} for a file or directory copied by Hive distcp. * * <p> * The target locations of data files for this table depend on the values of the resolved table root (e.g. * the value of {@link #COPY_TARGET_TABLE_ROOT} with tokens replaced) and {@link #RELOCATE_DATA_FILES_KEY}: * * if {@link #RELOCATE_DATA_FILES_KEY} is true, then origin file /path/to/file/myFile will be written to * /resolved/table/root/<partition>/myFile * * if {@link #COPY_TARGET_TABLE_PREFIX_TOBE_REPLACED} and {@link #COPY_TARGET_TABLE_PREFIX_REPLACEMENT} are defined, * then the specified prefix in each file will be replaced by the specified replacement. * * otherwise, if the resolved table root is defined (e.g. {@link #COPY_TARGET_TABLE_ROOT} is defined in the * properties), we define: * origin_table_root := the deepest non glob ancestor of table.getSc().getLocation() iff getLocation() points to * a single glob. (e.g. /path/to/*/files -> /path/to). If getLocation() contains none * or multiple globs, job will fail. * relative_path := path of the file relative to origin_table_root. If the path of the file is not a descendant * of origin_table_root, job will fail. * target_path := /resolved/table/root/relative/path * This mode is useful when moving a table with a complicated directory structure to a different base directory. * * otherwise the target is identical to the origin path. * </p> * * * @param sourcePath Source path to be transformed. * @param targetFs target {@link FileSystem} * @param partition partition this file belongs to. * @param isConcreteFile true if this is a path to an existing file in HDFS. */ public Path getTargetPath(Path sourcePath, FileSystem targetFs, Optional<Partition> partition, boolean isConcreteFile) { if (this.relocateDataFiles) { Preconditions.checkArgument(this.targetTableRoot.isPresent(), "Must define %s to relocate data files.", COPY_TARGET_TABLE_ROOT); Path path = this.targetTableRoot.get(); if (partition.isPresent()) { path = addPartitionToPath(path, partition.get()); } if (!isConcreteFile) { return targetFs.makeQualified(path); } return targetFs.makeQualified(new Path(path, sourcePath.getName())); } // both prefixs must be present as the same time // can not used with option {@link #COPY_TARGET_TABLE_ROOT} if (this.targetTablePrefixTobeReplaced.isPresent() || this.targetTablePrefixReplacement.isPresent()) { Preconditions.checkState(this.targetTablePrefixTobeReplaced.isPresent(), String.format("Must specify both %s option and %s option together", COPY_TARGET_TABLE_PREFIX_TOBE_REPLACED, COPY_TARGET_TABLE_PREFIX_REPLACEMENT)); Preconditions.checkState(this.targetTablePrefixReplacement.isPresent(), String.format("Must specify both %s option and %s option together", COPY_TARGET_TABLE_PREFIX_TOBE_REPLACED, COPY_TARGET_TABLE_PREFIX_REPLACEMENT)); Preconditions.checkState(!this.targetTableRoot.isPresent(), String.format("Can not specify the option %s with option %s ", COPY_TARGET_TABLE_ROOT, COPY_TARGET_TABLE_PREFIX_REPLACEMENT)); Path targetPathWithoutSchemeAndAuthority = HiveCopyEntityHelper.replacedPrefix(sourcePath, this.targetTablePrefixTobeReplaced.get(), this.targetTablePrefixReplacement.get()); return targetFs.makeQualified(targetPathWithoutSchemeAndAuthority); } else if (this.targetTableRoot.isPresent()) { Preconditions.checkArgument(this.dataset.getTableRootPath().isPresent(), "Cannot move paths to a new root unless table has exactly one location."); Preconditions.checkArgument(PathUtils.isAncestor(this.dataset.getTableRootPath().get(), sourcePath), "When moving paths to a new root, all locations must be descendants of the table root location. " + "Table root location: %s, file location: %s.", this.dataset.getTableRootPath(), sourcePath); Path relativePath = PathUtils.relativizePath(sourcePath, this.dataset.getTableRootPath().get()); return targetFs.makeQualified(new Path(this.targetTableRoot.get(), relativePath)); } else { return targetFs.makeQualified(PathUtils.getPathWithoutSchemeAndAuthority(sourcePath)); } } }