/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.hive.hcatalog.mapreduce;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hive.common.FileUtils;
import org.apache.hadoop.hive.common.StatsSetupConst;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.IMetaStoreClient;
import org.apache.hadoop.hive.metastore.MetaStoreUtils;
import org.apache.hadoop.hive.metastore.Warehouse;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.InvalidOperationException;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.metastore.api.NoSuchObjectException;
import org.apache.hadoop.hive.metastore.api.Partition;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.ql.metadata.HiveStorageHandler;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.shims.ShimLoader;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.JobStatus.State;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hive.hcatalog.common.ErrorType;
import org.apache.hive.hcatalog.common.HCatConstants;
import org.apache.hive.hcatalog.common.HCatException;
import org.apache.hive.hcatalog.common.HCatUtil;
import org.apache.hive.hcatalog.data.schema.HCatFieldSchema;
import org.apache.hive.hcatalog.data.schema.HCatSchema;
import org.apache.hive.hcatalog.data.schema.HCatSchemaUtils;
import org.apache.hive.hcatalog.har.HarOutputCommitterPostProcessor;
import org.apache.thrift.TException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Part of the FileOutput*Container classes
* See {@link FileOutputFormatContainer} for more information
*/
class FileOutputCommitterContainer extends OutputCommitterContainer {

  // Directory/file names that must never be moved into the final output location.
  private static final String TEMP_DIR_NAME = "_temporary";
  private static final String LOGS_DIR_NAME = "_logs";
  // Scratch-directory name prefixes used while a job is still writing:
  // _DYN<jobid> for dynamic partitioning, _SCRATCH<n> for external-table writes.
  static final String DYNTEMP_DIR_NAME = "_DYN";
  static final String SCRATCH_DIR_NAME = "_SCRATCH";
  // Suffix used to de-clash file names when appending to a mutable table: name_a_N.ext
  private static final String APPEND_SUFFIX = "_a_";
  // Warn once an append job has probed this many clashing names; also the
  // default value for maxAppendAttempts.
  private static final int APPEND_COUNTER_WARN_THRESHOLD = 1000;
  // Upper bound on append-name probes, configurable via HCatConstants.HCAT_APPEND_LIMIT.
  private final int maxAppendAttempts;
  private static final Logger LOG = LoggerFactory.getLogger(FileOutputCommitterContainer.class);
  // True when this job writes dynamic partitions (affects nearly every method below).
  private final boolean dynamicPartitioningUsed;
  // True once discoverPartitions() has run; trivially true for static partitioning.
  private boolean partitionsDiscovered;
  // True only for dynamic-partition writes to an EXTERNAL table with a custom path.
  private final boolean customDynamicLocationUsed;
  // Discovered partition specs (key -> value) keyed by partition directory path.
  private Map<String, Map<String, String>> partitionsDiscoveredByPath;
  // Per-partition JobContexts (storage handler configured), keyed by the same paths.
  private Map<String, JobContext> contextDiscoveredByPath;
  private final HiveStorageHandler cachedStorageHandler;
  HarOutputCommitterPostProcessor harProcessor = new HarOutputCommitterPostProcessor();
  // Cached root of the dynamic-partition scratch tree; computed once per job.
  private String ptnRootLocation = null;
  private OutputJobInfo jobInfo = null;
/**
* @param context current JobContext
* @param baseCommitter OutputCommitter to contain
* @throws IOException
*/
public FileOutputCommitterContainer(JobContext context,
org.apache.hadoop.mapred.OutputCommitter baseCommitter) throws IOException {
super(context, baseCommitter);
jobInfo = HCatOutputFormat.getJobInfo(context.getConfiguration());
dynamicPartitioningUsed = jobInfo.isDynamicPartitioningUsed();
this.partitionsDiscovered = !dynamicPartitioningUsed;
cachedStorageHandler = HCatUtil.getStorageHandler(context.getConfiguration(), jobInfo.getTableInfo().getStorerInfo());
Table table = new Table(jobInfo.getTableInfo().getTable());
if (dynamicPartitioningUsed && Boolean.parseBoolean((String)table.getProperty("EXTERNAL"))
&& jobInfo.getCustomDynamicPath() != null
&& jobInfo.getCustomDynamicPath().length() > 0) {
customDynamicLocationUsed = true;
} else {
customDynamicLocationUsed = false;
}
this.maxAppendAttempts = context.getConfiguration().getInt(HCatConstants.HCAT_APPEND_LIMIT, APPEND_COUNTER_WARN_THRESHOLD);
}
@Override
public void abortTask(TaskAttemptContext context) throws IOException {
if (!dynamicPartitioningUsed) {
FileOutputFormatContainer.setWorkOutputPath(context);
getBaseOutputCommitter().abortTask(HCatMapRedUtil.createTaskAttemptContext(context));
} else {
try {
TaskCommitContextRegistry.getInstance().abortTask(context);
}
finally {
TaskCommitContextRegistry.getInstance().discardCleanupFor(context);
}
}
}
@Override
public void commitTask(TaskAttemptContext context) throws IOException {
if (!dynamicPartitioningUsed) {
//See HCATALOG-499
FileOutputFormatContainer.setWorkOutputPath(context);
getBaseOutputCommitter().commitTask(HCatMapRedUtil.createTaskAttemptContext(context));
} else {
try {
TaskCommitContextRegistry.getInstance().commitTask(context);
}
finally {
TaskCommitContextRegistry.getInstance().discardCleanupFor(context);
}
}
}
  /**
   * Asks the wrapped committer for static writes whether a commit is needed.
   * For dynamic partitioning, always reports true so that commitTask() runs
   * and can delegate to the TaskCommitContextRegistry.
   */
  @Override
  public boolean needsTaskCommit(TaskAttemptContext context) throws IOException {
    if (!dynamicPartitioningUsed) {
      FileOutputFormatContainer.setWorkOutputPath(context);
      return getBaseOutputCommitter().needsTaskCommit(HCatMapRedUtil.createTaskAttemptContext(context));
    } else {
      // NOTE(review): an earlier comment here claimed "return false by default",
      // but the code returns true. Per-partition commit work is registered via
      // FileRecordWriterContainer and performed in commitTask() through the
      // TaskCommitContextRegistry, so true is returned to guarantee commitTask runs.
      return true;
    }
  }
@Override
public void setupJob(JobContext context) throws IOException {
if (getBaseOutputCommitter() != null && !dynamicPartitioningUsed) {
getBaseOutputCommitter().setupJob(HCatMapRedUtil.createJobContext(context));
}
// in dynamic usecase, called through FileRecordWriterContainer
}
@Override
public void setupTask(TaskAttemptContext context) throws IOException {
if (!dynamicPartitioningUsed) {
getBaseOutputCommitter().setupTask(HCatMapRedUtil.createTaskAttemptContext(context));
}
}
  /**
   * Aborts the job: aborts the underlying committer(s), then removes the
   * temporary output directory (never the table directory itself), and finally
   * cancels any delegation tokens obtained for the job.
   */
  @Override
  public void abortJob(JobContext jobContext, State state) throws IOException {
    try {
      if (dynamicPartitioningUsed) {
        // Populate contextDiscoveredByPath so each partition's committer can be aborted.
        discoverPartitions(jobContext);
      }
      org.apache.hadoop.mapred.JobContext mapRedJobContext = HCatMapRedUtil
        .createJobContext(jobContext);
      if (getBaseOutputCommitter() != null && !dynamicPartitioningUsed) {
        getBaseOutputCommitter().abortJob(mapRedJobContext, state);
      } else if (dynamicPartitioningUsed) {
        // Abort the committer of every dynamically discovered partition context.
        for (JobContext currContext : contextDiscoveredByPath.values()) {
          try {
            new JobConf(currContext.getConfiguration())
              .getOutputCommitter().abortJob(currContext,
                state);
          } catch (Exception e) {
            throw new IOException(e);
          }
        }
      }
      Path src;
      OutputJobInfo jobInfo = HCatOutputFormat.getJobInfo(jobContext.getConfiguration());
      Path tblPath = new Path(jobInfo.getTableInfo().getTableLocation());
      if (dynamicPartitioningUsed) {
        // For dynamic partitioning the scratch data lives under the partition root
        // (or the custom dynamic root), not directly under jobInfo.getLocation().
        if (!customDynamicLocationUsed) {
          src = new Path(getPartitionRootLocation(jobInfo.getLocation(), jobInfo.getTableInfo().getTable()
            .getPartitionKeysSize()));
        } else {
          src = new Path(getCustomPartitionRootLocation(jobInfo, jobContext.getConfiguration()));
        }
      } else {
        src = new Path(jobInfo.getLocation());
      }
      FileSystem fs = src.getFileSystem(jobContext.getConfiguration());
      // Note fs.delete will fail on Windows. The reason is in OutputCommitter,
      // Hadoop is still writing to _logs/history. On Linux, OS don't care file is still
      // open and remove the directory anyway, but on Windows, OS refuse to remove a
      // directory containing open files. So on Windows, we will leave output directory
      // behind when job fail. User needs to remove the output directory manually
      LOG.info("Job failed. Try cleaning up temporary directory [{}].", src);
      // Safety check: never delete the table directory itself.
      if (!src.equals(tblPath)){
        fs.delete(src, true);
      }
    } finally {
      // Tokens must be cancelled even if the cleanup above fails.
      cancelDelegationTokens(jobContext);
    }
  }
  // Marker file created in the output directory on successful job completion.
  public static final String SUCCEEDED_FILE_NAME = "_SUCCESS";
  // Hadoop configuration key controlling whether the _SUCCESS marker is written.
  static final String SUCCESSFUL_JOB_OUTPUT_DIR_MARKER =
    "mapreduce.fileoutputcommitter.marksuccessfuljobs";
  /**
   * Returns whether the job asked for a _SUCCESS marker file in the output
   * directory. Defaults to false when the property is unset.
   */
  private static boolean getOutputDirMarking(Configuration conf) {
    return conf.getBoolean(SUCCESSFUL_JOB_OUTPUT_DIR_MARKER,
      false);
  }
  /**
   * Commits the job: commits the underlying committer(s), moves output into
   * place and registers new partitions with the metastore, optionally writes a
   * _SUCCESS marker, and finally cancels delegation tokens.
   */
  @Override
  public void commitJob(JobContext jobContext) throws IOException {
    if (dynamicPartitioningUsed) {
      discoverPartitions(jobContext);
      // Commit each partition so it gets moved out of the job work
      // dir
      for (JobContext context : contextDiscoveredByPath.values()) {
        new JobConf(context.getConfiguration())
          .getOutputCommitter().commitJob(context);
      }
    }
    if (getBaseOutputCommitter() != null && !dynamicPartitioningUsed) {
      getBaseOutputCommitter().commitJob(
        HCatMapRedUtil.createJobContext(jobContext));
    }
    // Move data into its final location and register partition(s) in the metastore.
    registerPartitions(jobContext);
    // create _SUCCESS FILE if so requested.
    OutputJobInfo jobInfo = HCatOutputFormat.getJobInfo(jobContext.getConfiguration());
    if (getOutputDirMarking(jobContext.getConfiguration())) {
      Path outputPath = new Path(jobInfo.getLocation());
      FileSystem fileSys = outputPath.getFileSystem(jobContext
        .getConfiguration());
      // create a file in the folder to mark it
      if (fileSys.exists(outputPath)) {
        Path filePath = new Path(outputPath,
          SUCCEEDED_FILE_NAME);
        if (!fileSys.exists(filePath)) { // may have been
          // created by
          // baseCommitter.commitJob()
          fileSys.create(filePath).close();
        }
      }
    }
    // Commit has succeeded (since no exceptions have been thrown.)
    // Safe to cancel delegation tokens now.
    cancelDelegationTokens(jobContext);
  }
@Override
public void cleanupJob(JobContext context) throws IOException {
throw new IOException("The method cleanupJob is deprecated and should not be called.");
}
private String getCustomPartitionRootLocation(OutputJobInfo jobInfo, Configuration conf) {
if (ptnRootLocation == null) {
// we only need to calculate it once, it'll be the same for other partitions in this job.
String parentPath = jobInfo.getTableInfo().getTableLocation();
if (jobInfo.getCustomDynamicRoot() != null
&& jobInfo.getCustomDynamicRoot().length() > 0) {
parentPath = new Path(parentPath, jobInfo.getCustomDynamicRoot()).toString();
}
Path ptnRoot = new Path(parentPath, DYNTEMP_DIR_NAME +
conf.get(HCatConstants.HCAT_DYNAMIC_PTN_JOBID));
ptnRootLocation = ptnRoot.toString();
}
return ptnRootLocation;
}
private String getPartitionRootLocation(String ptnLocn, int numPtnKeys) {
if (customDynamicLocationUsed) {
return null;
}
if (ptnRootLocation == null) {
// we only need to calculate it once, it'll be the same for other partitions in this job.
Path ptnRoot = new Path(ptnLocn);
for (int i = 0; i < numPtnKeys; i++) {
// LOG.info("Getting parent of "+ptnRoot.getName());
ptnRoot = ptnRoot.getParent();
}
ptnRootLocation = ptnRoot.toString();
}
// LOG.info("Returning final parent : "+ptnRootLocation);
return ptnRootLocation;
}
  /**
   * Generate partition metadata object to be used to add to metadata.
   * Also creates the partition directory chain on the filesystem and applies
   * the table's group/permissions to it (skipped for HDFS, where umask applies).
   * @param context The job context.
   * @param jobInfo The OutputJobInfo.
   * @param partLocnRoot The table-equivalent location root of the partition
   *            (temporary dir if dynamic partition, table dir if static)
   * @param dynPartPath The path of dynamic partition which is created
   * @param partKVs The keyvalue pairs that form the partition
   * @param outputSchema The output schema for the partition
   * @param params The parameters to store inside the partition
   * @param table The Table metadata object under which this Partition will reside
   * @param fs FileSystem object to operate on the underlying filesystem
   * @param grpName Group name that owns the table dir
   * @param perms FsPermission that's the default permission of the table dir.
   * @return Constructed Partition metadata object
   * @throws java.io.IOException
   */
  private Partition constructPartition(
    JobContext context, OutputJobInfo jobInfo,
    String partLocnRoot, String dynPartPath, Map<String, String> partKVs,
    HCatSchema outputSchema, Map<String, String> params,
    Table table, FileSystem fs,
    String grpName, FsPermission perms) throws IOException {
    Partition partition = new Partition();
    partition.setDbName(table.getDbName());
    partition.setTableName(table.getTableName());
    // Clone the table's storage descriptor, then override its columns with the
    // partition's own output schema below.
    partition.setSd(new StorageDescriptor(table.getTTable().getSd()));
    List<FieldSchema> fields = new ArrayList<FieldSchema>();
    for (HCatFieldSchema fieldSchema : outputSchema.getFields()) {
      fields.add(HCatSchemaUtils.getFieldSchema(fieldSchema));
    }
    partition.getSd().setCols(fields);
    partition.setValues(FileOutputFormatContainer.getPartitionValueList(table, partKVs));
    partition.setParameters(params);
    // Sets permissions and group name on partition dirs and files.
    Path partPath;
    if (customDynamicLocationUsed) {
      partPath = new Path(dynPartPath);
    } else if (!dynamicPartitioningUsed
      && Boolean.parseBoolean((String)table.getProperty("EXTERNAL"))
      && jobInfo.getLocation() != null && jobInfo.getLocation().length() > 0) {
      // Now, we need to de-scratchify this location - i.e., get rid of any
      // _SCRATCH[\d].?[\d]+ from the location.
      String jobLocation = jobInfo.getLocation();
      String finalLocn = jobLocation.replaceAll(Path.SEPARATOR + SCRATCH_DIR_NAME + "\\d\\.?\\d+","");
      partPath = new Path(finalLocn);
    } else {
      // Standard case: build <partLocnRoot>/<key1>=<val1>/<key2>=<val2>/...,
      // creating and chmod-ing each intermediate directory except the root itself.
      partPath = new Path(partLocnRoot);
      int i = 0;
      for (FieldSchema partKey : table.getPartitionKeys()) {
        if (i++ != 0) {
          fs.mkdirs(partPath); // Attempt to make the path in case it does not exist before we check
          applyGroupAndPerms(fs, partPath, perms, grpName, false);
        }
        partPath = constructPartialPartPath(partPath, partKey.getName().toLowerCase(), partKVs);
      }
    }
    // Apply the group and permissions to the leaf partition and files.
    // Need not bother in case of HDFS as permission is taken care of by setting UMask
    fs.mkdirs(partPath); // Attempt to make the path in case it does not exist before we check
    if (!ShimLoader.getHadoopShims().getHCatShim().isFileInHDFS(fs, partPath)) {
      applyGroupAndPerms(fs, partPath, perms, grpName, true);
    }
    // Set the location in the StorageDescriptor
    if (dynamicPartitioningUsed) {
      // For dynamic partitions, register the final destination under the table
      // directory, not the _DYN scratch dir the data currently sits in.
      String dynamicPartitionDestination = getFinalDynamicPartitionDestination(table, partKVs, jobInfo);
      if (harProcessor.isEnabled()) {
        harProcessor.exec(context, partition, partPath);
        partition.getSd().setLocation(
          harProcessor.getProcessedLocation(new Path(dynamicPartitionDestination)));
      } else {
        partition.getSd().setLocation(dynamicPartitionDestination);
      }
    } else {
      partition.getSd().setLocation(partPath.toString());
    }
    return partition;
  }
  /**
   * Applies the given permission to a directory (and, when recursive, to its
   * whole subtree: directories recursed into, plain files chmod-ed directly).
   *
   * NOTE(review): despite the method name and the {@code group} parameter, no
   * group ownership is ever applied here — only fs.setPermission is called,
   * never fs.setOwner. Confirm whether group handling was intentionally removed
   * or is done elsewhere before relying on the name.
   */
  private void applyGroupAndPerms(FileSystem fs, Path dir, FsPermission permission,
                  String group, boolean recursive)
    throws IOException {
    if(LOG.isDebugEnabled()) {
      LOG.debug("applyGroupAndPerms : " + dir +
        " perms: " + permission +
        " group: " + group + " recursive: " + recursive);
    }
    fs.setPermission(dir, permission);
    if (recursive) {
      for (FileStatus fileStatus : fs.listStatus(dir)) {
        if (fileStatus.isDir()) {
          applyGroupAndPerms(fs, fileStatus.getPath(), permission, group, true);
        } else {
          fs.setPermission(fileStatus.getPath(), permission);
        }
      }
    }
  }
private String getFinalDynamicPartitionDestination(Table table, Map<String, String> partKVs,
OutputJobInfo jobInfo) {
Path partPath = new Path(table.getTTable().getSd().getLocation());
if (!customDynamicLocationUsed) {
// file:///tmp/hcat_junit_warehouse/employee/_DYN0.7770480401313761/emp_country=IN/emp_state=KA ->
// file:///tmp/hcat_junit_warehouse/employee/emp_country=IN/emp_state=KA
for (FieldSchema partKey : table.getPartitionKeys()) {
partPath = constructPartialPartPath(partPath, partKey.getName().toLowerCase(), partKVs);
}
return partPath.toString();
} else {
// if custom root specified, update the parent path
if (jobInfo.getCustomDynamicRoot() != null
&& jobInfo.getCustomDynamicRoot().length() > 0) {
partPath = new Path(partPath, jobInfo.getCustomDynamicRoot());
}
return new Path(partPath, HCatFileUtil.resolveCustomPath(jobInfo, partKVs, false)).toString();
}
}
private Map<String, String> getStorerParameterMap(StorerInfo storer) {
Map<String, String> params = new HashMap<String, String>();
//Copy table level hcat.* keys to the partition
for (Entry<Object, Object> entry : storer.getProperties().entrySet()) {
if (!entry.getKey().toString().equals(StatsSetupConst.COLUMN_STATS_ACCURATE)) {
params.put(entry.getKey().toString(), entry.getValue().toString());
}
}
return params;
}
private Path constructPartialPartPath(Path partialPath, String partKey, Map<String, String> partKVs) {
StringBuilder sb = new StringBuilder(FileUtils.escapePathName(partKey));
sb.append("=");
sb.append(FileUtils.escapePathName(partKVs.get(partKey)));
return new Path(partialPath, sb.toString());
}
/**
* Update table schema, adding new columns as added for the partition.
* @param client the client
* @param table the table
* @param partitionSchema the schema of the partition
* @throws java.io.IOException Signals that an I/O exception has occurred.
* @throws org.apache.hadoop.hive.metastore.api.InvalidOperationException the invalid operation exception
* @throws org.apache.hadoop.hive.metastore.api.MetaException the meta exception
* @throws org.apache.thrift.TException the t exception
*/
private void updateTableSchema(IMetaStoreClient client, Table table,
HCatSchema partitionSchema) throws IOException, InvalidOperationException, MetaException, TException {
List<FieldSchema> newColumns = HCatUtil.validatePartitionSchema(table, partitionSchema);
if (newColumns.size() != 0) {
List<FieldSchema> tableColumns = new ArrayList<FieldSchema>(table.getTTable().getSd().getCols());
tableColumns.addAll(newColumns);
//Update table schema to add the newly added columns
table.getTTable().getSd().setCols(tableColumns);
client.alter_table(table.getDbName(), table.getTableName(), table.getTTable());
}
}
  /**
   * Move all of the files from the temp directory to the final location.
   * Recurses into directories; for dynamic partitioning, moves whole leaf
   * directories at once (HCATALOG-538) instead of file-by-file. With
   * dryRun=true, nothing is moved — only duplicate-publish checks are made.
   * @param fs the output file system
   * @param file the file to move
   * @param srcDir the source directory
   * @param destDir the target directory
   * @param dryRun - a flag that simply tests if this move would succeed or not based
   *         on whether other files exist where we're trying to copy
   * @param immutable whether the destination must be treated as immutable;
   *         forced to true whenever dynamic partitioning is used
   * @throws java.io.IOException
   */
  private void moveTaskOutputs(FileSystem fs, Path file, Path srcDir,
                 Path destDir, final boolean dryRun, boolean immutable
  ) throws IOException {
    if(LOG.isDebugEnabled()) {
      LOG.debug("moveTaskOutputs "
        + file + " from: " + srcDir + " to: " + destDir
        + " dry: " + dryRun + " immutable: " + immutable);
    }
    if (dynamicPartitioningUsed) {
      immutable = true; // Making sure we treat dynamic partitioning jobs as if they were immutable.
    }
    // Never move framework bookkeeping entries (_temporary, _logs, _SUCCESS).
    if (file.getName().equals(TEMP_DIR_NAME) || file.getName().equals(LOGS_DIR_NAME) || file.getName().equals(SUCCEEDED_FILE_NAME)) {
      return;
    }
    final Path finalOutputPath = getFinalPath(fs, file, srcDir, destDir, immutable);
    FileStatus fileStatus = fs.getFileStatus(file);
    if (!fileStatus.isDir()) {
      if (dryRun){
        if (immutable){
          // Dryrun checks are meaningless for mutable table - we should always succeed
          // unless there is a runtime IOException.
          if(LOG.isDebugEnabled()) {
            LOG.debug("Testing if moving file: [" + file + "] to ["
              + finalOutputPath + "] would cause a problem");
          }
          if (fs.exists(finalOutputPath)) {
            throw new HCatException(ErrorType.ERROR_MOVE_FAILED, "Data already exists in "
              + finalOutputPath + ", duplicate publish not possible.");
          }
        }
      } else {
        if(LOG.isDebugEnabled()) {
          LOG.debug("Moving file: [ " + file + "] to [" + finalOutputPath + "]");
        }
        // Make sure the parent directory exists. It is not an error
        // to recreate an existing directory
        fs.mkdirs(finalOutputPath.getParent());
        if (!fs.rename(file, finalOutputPath)) {
          // Rename can fail because the destination exists; remove it and retry once.
          if (!fs.delete(finalOutputPath, true)) {
            throw new HCatException(ErrorType.ERROR_MOVE_FAILED, "Failed to delete existing path " + finalOutputPath);
          }
          if (!fs.rename(file, finalOutputPath)) {
            throw new HCatException(ErrorType.ERROR_MOVE_FAILED, "Failed to move output to " + finalOutputPath);
          }
        }
      }
    } else {
      // Directory: find the first "real" child (skipping bookkeeping entries)
      // to decide whether to recurse or move.
      FileStatus[] children = fs.listStatus(file);
      FileStatus firstChild = null;
      if (children != null) {
        int index=0;
        while (index < children.length) {
          if ( !children[index].getPath().getName().equals(TEMP_DIR_NAME)
            && !children[index].getPath().getName().equals(LOGS_DIR_NAME)
            && !children[index].getPath().getName().equals(SUCCEEDED_FILE_NAME)) {
            firstChild = children[index];
            break;
          }
          index++;
        }
      }
      if(firstChild!=null && firstChild.isDir()) {
        // If the first child is directory, then rest would be directory too according to HCatalog dir structure
        // recurse in that case
        for (FileStatus child : children) {
          moveTaskOutputs(fs, child.getPath(), srcDir, destDir, dryRun, immutable);
        }
      } else {
        if (!dryRun) {
          if (dynamicPartitioningUsed) {
            // Optimization: if the first child is file, we have reached the leaf directory, move the parent directory itself
            // instead of moving each file under the directory. See HCATALOG-538
            // Note for future Append implementation : This optimization is another reason dynamic
            // partitioning is currently incompatible with append on mutable tables.
            final Path parentDir = finalOutputPath.getParent();
            // Create the directory
            Path placeholder = new Path(parentDir, "_placeholder");
            if (fs.mkdirs(parentDir)) {
              // It is weired but we need a placeholder,
              // otherwise rename cannot move file to the right place
              fs.create(placeholder).close();
            }
            if (LOG.isDebugEnabled()) {
              LOG.debug("Moving directory: " + file + " to " + parentDir);
            }
            // If custom dynamic location provided, need to rename to final output path
            Path dstPath = !customDynamicLocationUsed ? parentDir : finalOutputPath;
            if (!fs.rename(file, dstPath)) {
              final String msg = "Failed to move file: " + file + " to " + dstPath;
              LOG.error(msg);
              throw new HCatException(ErrorType.ERROR_MOVE_FAILED, msg);
            }
            fs.delete(placeholder, false);
          } else {
            // In case of no partition we have to move each file
            for (FileStatus child : children) {
              moveTaskOutputs(fs, child.getPath(), srcDir, destDir, dryRun, immutable);
            }
          }
        } else {
          // Dry run on a leaf directory: an existing non-empty destination means
          // a duplicate publish for an immutable table.
          if(immutable && fs.exists(finalOutputPath) && !MetaStoreUtils.isDirEmpty(fs, finalOutputPath)) {
            throw new HCatException(ErrorType.ERROR_DUPLICATE_PARTITION, "Data already exists in " + finalOutputPath
              + ", duplicate publish not possible.");
          }
        }
      }
    }
  }
/**
* Find the final name of a given output file, given the output directory
* and the work directory. If immutable, attempt to create file of name
* _aN till we find an item that does not exist.
* @param file the file to move
* @param src the source directory
* @param dest the target directory
* @return the final path for the specific output file
* @throws java.io.IOException
*/
private Path getFinalPath(FileSystem fs, Path file, Path src,
Path dest, final boolean immutable) throws IOException {
URI taskOutputUri = file.toUri();
URI relativePath = src.toUri().relativize(taskOutputUri);
if (taskOutputUri == relativePath) {
throw new HCatException(ErrorType.ERROR_MOVE_FAILED, "Can not get the relative path: base = " +
src + " child = " + file);
}
if (relativePath.getPath().length() > 0) {
Path itemDest = new Path(dest, relativePath.getPath());
if (!immutable){
String name = relativePath.getPath();
String filetype;
int index = name.lastIndexOf('.');
if (index >= 0) {
filetype = name.substring(index);
name = name.substring(0, index);
} else {
filetype = "";
}
// Attempt to find maxAppendAttempts possible alternatives to a filename by
// appending _a_N and seeing if that destination also clashes. If we're
// still clashing after that, give up.
int counter = 1;
for (; fs.exists(itemDest) && counter < maxAppendAttempts; counter++) {
itemDest = new Path(dest, name + (APPEND_SUFFIX + counter) + filetype);
}
if (counter == maxAppendAttempts){
throw new HCatException(ErrorType.ERROR_MOVE_FAILED,
"Could not find a unique destination path for move: file = "
+ file + " , src = " + src + ", dest = " + dest);
} else if (counter > APPEND_COUNTER_WARN_THRESHOLD) {
LOG.warn("Append job used filename clash counter [" + counter
+"] which is greater than warning limit [" + APPEND_COUNTER_WARN_THRESHOLD
+"]. Please compact this table so that performance is not impacted."
+ " Please see HIVE-9381 for details.");
}
}
if (LOG.isDebugEnabled()){
LOG.debug("FinalPath(file:"+file+":"+src+"->"+dest+"="+itemDest);
}
return itemDest;
} else {
return dest;
}
}
/**
* Run to discover dynamic partitions available
*/
private void discoverPartitions(JobContext context) throws IOException {
if (!partitionsDiscovered) {
// LOG.info("discover ptns called");
OutputJobInfo jobInfo = HCatOutputFormat.getJobInfo(context.getConfiguration());
harProcessor.setEnabled(jobInfo.getHarRequested());
List<Integer> dynamicPartCols = jobInfo.getPosOfDynPartCols();
int maxDynamicPartitions = jobInfo.getMaxDynamicPartitions();
Path loadPath = new Path(jobInfo.getLocation());
FileSystem fs = loadPath.getFileSystem(context.getConfiguration());
// construct a path pattern (e.g., /*/*) to find all dynamically generated paths
String dynPathSpec = loadPath.toUri().getPath();
dynPathSpec = dynPathSpec.replaceAll("__HIVE_DEFAULT_PARTITION__", "*");
// LOG.info("Searching for "+dynPathSpec);
Path pathPattern = new Path(dynPathSpec);
FileStatus[] status = fs.globStatus(pathPattern, FileUtils.HIDDEN_FILES_PATH_FILTER);
partitionsDiscoveredByPath = new LinkedHashMap<String, Map<String, String>>();
contextDiscoveredByPath = new LinkedHashMap<String, JobContext>();
if (status.length == 0) {
// LOG.warn("No partition found genereated by dynamic partitioning in ["
// +loadPath+"] with depth["+jobInfo.getTable().getPartitionKeysSize()
// +"], dynSpec["+dynPathSpec+"]");
} else {
if ((maxDynamicPartitions != -1) && (status.length > maxDynamicPartitions)) {
this.partitionsDiscovered = true;
throw new HCatException(ErrorType.ERROR_TOO_MANY_DYNAMIC_PTNS,
"Number of dynamic partitions being created "
+ "exceeds configured max allowable partitions["
+ maxDynamicPartitions
+ "], increase parameter ["
+ HiveConf.ConfVars.DYNAMICPARTITIONMAXPARTS.varname
+ "] if needed.");
}
for (FileStatus st : status) {
LinkedHashMap<String, String> fullPartSpec = new LinkedHashMap<String, String>();
if (!customDynamicLocationUsed) {
Warehouse.makeSpecFromName(fullPartSpec, st.getPath());
} else {
HCatFileUtil.getPartKeyValuesForCustomLocation(fullPartSpec, jobInfo,
st.getPath().toString());
}
partitionsDiscoveredByPath.put(st.getPath().toString(), fullPartSpec);
JobConf jobConf = (JobConf)context.getConfiguration();
JobContext currContext = HCatMapRedUtil.createJobContext(
jobConf,
context.getJobID(),
InternalUtil.createReporter(HCatMapRedUtil.createTaskAttemptContext(jobConf,
ShimLoader.getHadoopShims().getHCatShim().createTaskAttemptID())));
HCatOutputFormat.configureOutputStorageHandler(currContext, jobInfo, fullPartSpec);
contextDiscoveredByPath.put(st.getPath().toString(), currContext);
}
}
// for (Entry<String,Map<String,String>> spec : partitionsDiscoveredByPath.entrySet()){
// LOG.info("Partition "+ spec.getKey());
// for (Entry<String,String> e : spec.getValue().entrySet()){
// LOG.info(e.getKey() + "=>" +e.getValue());
// }
// }
this.partitionsDiscovered = true;
}
}
private void registerPartitions(JobContext context) throws IOException{
if (dynamicPartitioningUsed){
discoverPartitions(context);
}
OutputJobInfo jobInfo = HCatOutputFormat.getJobInfo(context.getConfiguration());
Configuration conf = context.getConfiguration();
Table table = new Table(jobInfo.getTableInfo().getTable());
Path tblPath = new Path(table.getTTable().getSd().getLocation());
FileSystem fs = tblPath.getFileSystem(conf);
IMetaStoreClient client = null;
HCatTableInfo tableInfo = jobInfo.getTableInfo();
List<Partition> partitionsAdded = new ArrayList<Partition>();
try {
HiveConf hiveConf = HCatUtil.getHiveConf(conf);
client = HCatUtil.getHiveMetastoreClient(hiveConf);
if (table.getPartitionKeys().size() == 0) {
// Move data from temp directory the actual table directory
// No metastore operation required.
Path src = new Path(jobInfo.getLocation());
moveTaskOutputs(fs, src, src, tblPath, false, table.isImmutable());
if (!src.equals(tblPath)) {
fs.delete(src, true);
}
if (table.getParameters() != null
&& table.getParameters().containsKey(StatsSetupConst.COLUMN_STATS_ACCURATE)) {
table.getParameters().remove(StatsSetupConst.COLUMN_STATS_ACCURATE);
client.alter_table(table.getDbName(), table.getTableName(), table.getTTable());
}
return;
}
StorerInfo storer = InternalUtil.extractStorerInfo(table.getTTable().getSd(),
table.getParameters());
FileStatus tblStat = fs.getFileStatus(tblPath);
String grpName = tblStat.getGroup();
FsPermission perms = tblStat.getPermission();
List<Partition> partitionsToAdd = new ArrayList<Partition>();
if (!dynamicPartitioningUsed) {
partitionsToAdd.add(constructPartition(context, jobInfo, tblPath.toString(), null,
jobInfo.getPartitionValues(), jobInfo.getOutputSchema(), getStorerParameterMap(storer),
table, fs, grpName, perms));
} else {
for (Entry<String, Map<String, String>> entry : partitionsDiscoveredByPath.entrySet()) {
partitionsToAdd.add(constructPartition(context, jobInfo,
getPartitionRootLocation(entry.getKey(), entry.getValue().size()), entry.getKey(),
entry.getValue(), jobInfo.getOutputSchema(), getStorerParameterMap(storer), table,
fs, grpName, perms));
}
}
ArrayList<Map<String,String>> ptnInfos = new ArrayList<Map<String,String>>();
for(Partition ptn : partitionsToAdd){
ptnInfos.add(InternalUtil.createPtnKeyValueMap(new Table(tableInfo.getTable()), ptn));
}
/**
* Dynamic partitioning & Append incompatibility note:
*
* Currently, we do not support mixing dynamic partitioning and append in the
* same job. One reason is that we need exhaustive testing of corner cases
* for that, and a second reason is the behaviour of add_partitions. To support
* dynamic partitioning with append, we'd have to have a add_partitions_if_not_exist
* call, rather than an add_partitions call. Thus far, we've tried to keep the
* implementation of append jobtype-agnostic, but here, in code, we assume that
* a table is considered immutable if dynamic partitioning is enabled on the job.
*
* This does not mean that we can check before the job begins that this is going
* to be a dynamic partition job on an immutable table and thus fail the job, since
* it is quite possible to have a dynamic partitioning job run on an unpopulated
* immutable table. It simply means that at the end of the job, as far as copying
* in data is concerned, we will pretend that the table is immutable irrespective
* of what table.isImmutable() tells us.
*/
//Publish the new partition(s)
if (dynamicPartitioningUsed && harProcessor.isEnabled() && (!partitionsToAdd.isEmpty())){
if (!customDynamicLocationUsed) {
Path src = new Path(ptnRootLocation);
// check here for each dir we're copying out, to see if it
// already exists, error out if so.
// Also, treat dyn-writes as writes to immutable tables.
moveTaskOutputs(fs, src, src, tblPath, true, true); // dryRun = true, immutable = true
moveTaskOutputs(fs, src, src, tblPath, false, true);
if (!src.equals(tblPath)){
fs.delete(src, true);
}
} else {
moveCustomLocationTaskOutputs(fs, table, hiveConf);
}
try {
updateTableSchema(client, table, jobInfo.getOutputSchema());
LOG.info("HAR is being used. The table {} has new partitions {}.", table.getTableName(), ptnInfos);
client.add_partitions(partitionsToAdd);
partitionsAdded = partitionsToAdd;
} catch (Exception e){
// There was an error adding partitions : rollback fs copy and rethrow
for (Partition p : partitionsToAdd){
Path ptnPath = new Path(harProcessor.getParentFSPath(new Path(p.getSd().getLocation())));
if (fs.exists(ptnPath)){
fs.delete(ptnPath,true);
}
}
throw e;
}
}else{
// no harProcessor, regular operation
updateTableSchema(client, table, jobInfo.getOutputSchema());
LOG.info("HAR not is not being used. The table {} has new partitions {}.", table.getTableName(), ptnInfos);
if (partitionsToAdd.size() > 0){
if (!dynamicPartitioningUsed ) {
// regular single-partition write into a partitioned table.
//Move data from temp directory the actual table directory
if (partitionsToAdd.size() > 1){
throw new HCatException(ErrorType.ERROR_PUBLISHING_PARTITION,
"More than one partition to publish in non-dynamic partitioning job");
}
Partition p = partitionsToAdd.get(0);
Path src = new Path(jobInfo.getLocation());
Path dest = new Path(p.getSd().getLocation());
moveTaskOutputs(fs, src, src, dest, true, table.isImmutable());
moveTaskOutputs(fs,src,src,dest,false,table.isImmutable());
if (!src.equals(dest)){
if (src.toString().matches(".*" + Path.SEPARATOR + SCRATCH_DIR_NAME + "\\d\\.?\\d+.*")){
// src is scratch directory, need to trim the part key value pairs from path
String diff = StringUtils.difference(src.toString(), dest.toString());
fs.delete(new Path(StringUtils.substringBefore(src.toString(), diff)), true);
} else {
fs.delete(src, true);
}
}
// Now, we check if the partition already exists. If not, we go ahead.
// If so, we error out if immutable, and if mutable, check that the partition's IF
// matches our current job's IF (table's IF) to check for compatibility. If compatible, we
// ignore and do not add. If incompatible, we error out again.
boolean publishRequired = false;
try {
Partition existingP = client.getPartition(p.getDbName(),p.getTableName(),p.getValues());
if (existingP != null){
if (table.isImmutable()){
throw new HCatException(ErrorType.ERROR_DUPLICATE_PARTITION,
"Attempted duplicate partition publish on to immutable table");
} else {
if (! existingP.getSd().getInputFormat().equals(table.getInputFormatClass().getName())){
throw new HCatException(ErrorType.ERROR_PUBLISHING_PARTITION,
"Attempted partition append, where old partition format was "
+ existingP.getSd().getInputFormat()
+ " and table format was "
+ table.getInputFormatClass().getName());
}
}
} else {
publishRequired = true;
}
} catch (NoSuchObjectException e){
// All good, no such partition exists, move on.
publishRequired = true;
}
if (publishRequired){
client.add_partitions(partitionsToAdd);
partitionsAdded = partitionsToAdd;
}
} else {
// Dynamic partitioning usecase
if (!customDynamicLocationUsed) {
Path src = new Path(ptnRootLocation);
moveTaskOutputs(fs, src, src, tblPath, true, true); // dryRun = true, immutable = true
moveTaskOutputs(fs, src, src, tblPath, false, true);
if (!src.equals(tblPath)){
fs.delete(src, true);
}
} else {
moveCustomLocationTaskOutputs(fs, table, hiveConf);
}
client.add_partitions(partitionsToAdd);
partitionsAdded = partitionsToAdd;
}
}
// Set permissions appropriately for each of the partitions we just created
// so as to have their permissions mimic the table permissions
for (Partition p : partitionsAdded){
applyGroupAndPerms(fs,new Path(p.getSd().getLocation()),tblStat.getPermission(),tblStat.getGroup(),true);
}
}
} catch (Exception e) {
if (partitionsAdded.size() > 0) {
try {
// baseCommitter.cleanupJob failed, try to clean up the
// metastore
for (Partition p : partitionsAdded) {
client.dropPartition(tableInfo.getDatabaseName(),
tableInfo.getTableName(), p.getValues(), true);
}
} catch (Exception te) {
// Keep cause as the original exception
throw new HCatException(
ErrorType.ERROR_PUBLISHING_PARTITION, e);
}
}
if (e instanceof HCatException) {
throw (HCatException) e;
} else {
throw new HCatException(ErrorType.ERROR_PUBLISHING_PARTITION, e);
}
} finally {
HCatUtil.closeHiveClientQuietly(client);
}
}
/**
 * Moves task outputs for dynamic partitions that use custom locations.
 * Because the custom partition locations contain regex patterns, the
 * partition-root subtree cannot be relocated wholesale; each discovered
 * partition's final destination is resolved individually and its output
 * moved there, after which the shared temp parent directory is removed.
 *
 * @param fs    filesystem hosting both the temp outputs and the destinations
 * @param table target table used to resolve final partition destinations
 * @param conf  job configuration used to locate the custom partition root
 * @throws IOException on any filesystem failure during move or cleanup
 */
private void moveCustomLocationTaskOutputs(FileSystem fs, Table table, Configuration conf)
  throws IOException {
  for (Entry<String, Map<String, String>> discovered : partitionsDiscoveredByPath.entrySet()) {
    Path partSrc = new Path(discovered.getKey());
    String resolvedDest = getFinalDynamicPartitionDestination(table, discovered.getValue(), jobInfo);
    Path partDest = new Path(resolvedDest);
    // First pass is a dry run (collision check), second pass performs the move;
    // dynamic-partition writes are always treated as writes to an immutable table.
    moveTaskOutputs(fs, partSrc, partSrc, partDest, true, true);
    moveTaskOutputs(fs, partSrc, partSrc, partDest, false, true);
  }
  // Remove the common temp parent directory of all custom dynamic partitions.
  Path tmpRoot = new Path(getCustomPartitionRootLocation(jobInfo, conf));
  if (fs.exists(tmpRoot)) {
    fs.delete(tmpRoot, true);
  }
}
/**
 * Cancels the metastore delegation token acquired for this job, if any.
 * Only tokens that HCatOutputFormat itself acquired are cancelled; tokens
 * supplied externally (e.g. by Oozie) are left alone — in that case the
 * HCAT_KEY_TOKEN_SIGNATURE property is absent from the job configuration.
 * Metastore errors are logged as warnings rather than propagated.
 *
 * @param context job context providing the configuration
 * @throws IOException if obtaining the Hive configuration or client fails
 */
private void cancelDelegationTokens(JobContext context) throws IOException {
  LOG.info("Cancelling delegation token for the job.");
  IMetaStoreClient msClient = null;
  try {
    HiveConf hiveConf = HCatUtil.getHiveConf(context.getConfiguration());
    msClient = HCatUtil.getHiveMetastoreClient(hiveConf);
    String tokenStr = msClient.getTokenStrForm();
    String tokenSignature =
      context.getConfiguration().get(HCatConstants.HCAT_KEY_TOKEN_SIGNATURE);
    if (tokenStr != null && tokenSignature != null) {
      msClient.cancelDelegationToken(tokenStr);
    }
  } catch (MetaException e) {
    LOG.warn("MetaException while cancelling delegation token.", e);
  } catch (TException e) {
    LOG.warn("TException while cancelling delegation token.", e);
  } finally {
    HCatUtil.closeHiveClientQuietly(msClient);
  }
}
}