/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.io;
import java.io.IOException;
import java.nio.file.FileSystemNotFoundException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hive.common.FileUtils;
import org.apache.hadoop.hive.common.JavaUtils;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.FileSinkDesc;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobContext;
import org.apache.hadoop.mapred.OutputCommitter;
import org.apache.hadoop.mapred.OutputFormat;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.TaskAttemptContext;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hive.common.util.ReflectionUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
import com.google.common.collect.ImmutableMap;
/**
* A utility class for various Hive file format tasks.
* getOutputFormatSubstitute(Class) is provided for backward compatibility: it returns the
* HiveOutputFormat substitute registered for an older, non-Hive OutputFormat class.
*
*/
public final class HiveFileFormatUtils {
private static final Logger LOG = LoggerFactory.getLogger(HiveFileFormatUtils.class);
public static class FileChecker {
// we don't have many file formats that implement InputFormatChecker. We won't be holding
// multiple instances of such classes
private static final int MAX_CACHE_SIZE = 16;
// immutable maps
Map<Class<? extends InputFormat>, Class<? extends InputFormatChecker>> inputFormatCheckerMap;
Map<Class<?>, Class<? extends OutputFormat>> outputFormatSubstituteMap;
// mutable thread-safe map to store instances
Cache<Class<? extends InputFormatChecker>, InputFormatChecker> inputFormatCheckerInstanceCache;
// Initialization-on-demand holder idiom: the JVM initializes the nested Factory class
// (and hence INSTANCE) lazily, on first access. Class initialization is thread-safe.
private static class Factory {
static final FileChecker INSTANCE = new FileChecker();
}
public static FileChecker getInstance() {
return Factory.INSTANCE;
}
private FileChecker() {
// read-only maps (initialized once)
inputFormatCheckerMap = ImmutableMap
.<Class<? extends InputFormat>, Class<? extends InputFormatChecker>>builder()
.put(SequenceFileInputFormat.class, SequenceFileInputFormatChecker.class)
.put(RCFileInputFormat.class, RCFileInputFormat.class)
.put(OrcInputFormat.class, OrcInputFormat.class)
.build();
outputFormatSubstituteMap = ImmutableMap
.<Class<?>, Class<? extends OutputFormat>>builder()
.put(IgnoreKeyTextOutputFormat.class, HiveIgnoreKeyTextOutputFormat.class)
.put(SequenceFileOutputFormat.class, HiveSequenceFileOutputFormat.class)
.build();
// updatable map that holds instances of the class
inputFormatCheckerInstanceCache = CacheBuilder.newBuilder().maximumSize(MAX_CACHE_SIZE)
.build();
}
public Set<Class<? extends InputFormat>> registeredClasses() {
return inputFormatCheckerMap.keySet();
}
public Class<? extends OutputFormat> getOutputFormatSubstitute(Class<?> origin) {
return outputFormatSubstituteMap.get(origin);
}
public Class<? extends InputFormatChecker> getInputFormatCheckerClass(Class<?> inputFormat) {
return inputFormatCheckerMap.get(inputFormat);
}
public void putInputFormatCheckerInstance(
Class<? extends InputFormatChecker> checkerCls, InputFormatChecker instanceCls) {
inputFormatCheckerInstanceCache.put(checkerCls, instanceCls);
}
public InputFormatChecker getInputFormatCheckerInstance(
Class<? extends InputFormatChecker> checkerCls) {
return inputFormatCheckerInstanceCache.getIfPresent(checkerCls);
}
}
/**
* Get an OutputFormat's substitute HiveOutputFormat.
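*
* A minimal usage sketch; the class literal below is just one of the registered
* substitutions:
* <pre>{@code
* Class<? extends OutputFormat> substitute =
*     HiveFileFormatUtils.getOutputFormatSubstitute(IgnoreKeyTextOutputFormat.class);
* // substitute is HiveIgnoreKeyTextOutputFormat.class
* }</pre>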
*/
@SuppressWarnings("unchecked")
public static Class<? extends OutputFormat> getOutputFormatSubstitute(
Class<?> origin) {
if (origin == null || HiveOutputFormat.class.isAssignableFrom(origin)) {
return (Class<? extends OutputFormat>) origin; // hive native
}
Class<? extends OutputFormat> substitute = FileChecker.getInstance()
.getOutputFormatSubstitute(origin);
if (substitute != null) {
return substitute; // substituted
}
return (Class<? extends OutputFormat>) origin;
}
/**
* Checks whether the given files are in the same format as the given input format.
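*
* A hedged usage sketch (the path is illustrative):
* <pre>{@code
* HiveConf conf = new HiveConf();
* FileSystem fs = FileSystem.get(conf);
* List<FileStatus> files =
*     Arrays.asList(fs.listStatus(new Path("/warehouse/db/tbl/part=1")));
* boolean sameFormat =
*     HiveFileFormatUtils.checkInputFormat(fs, conf, OrcInputFormat.class, files);
* }</pre>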
*/
@SuppressWarnings("unchecked")
public static boolean checkInputFormat(FileSystem fs, HiveConf conf,
Class<? extends InputFormat> inputFormatCls, List<FileStatus> files)
throws HiveException {
if (files.isEmpty()) return false;
Class<? extends InputFormatChecker> checkerCls = FileChecker.getInstance()
.getInputFormatCheckerClass(inputFormatCls);
if (checkerCls == null
&& inputFormatCls.isAssignableFrom(TextInputFormat.class)) {
// We have a text input format here. We cannot tell from a file's content whether it
// is text, so all we can do is test whether any other registered format accepts it.
// If no other format accepts the files, we treat them as text, although they may not be.
return checkTextInputFormat(fs, conf, files);
}
if (checkerCls != null) {
InputFormatChecker checkerInstance = FileChecker.getInstance()
.getInputFormatCheckerInstance(checkerCls);
try {
if (checkerInstance == null) {
checkerInstance = checkerCls.newInstance();
FileChecker.getInstance().putInputFormatCheckerInstance(checkerCls, checkerInstance);
}
return checkerInstance.validateInput(fs, conf, files);
} catch (Exception e) {
throw new HiveException(e);
}
}
return true;
}
@SuppressWarnings("unchecked")
private static boolean checkTextInputFormat(FileSystem fs, HiveConf conf,
List<FileStatus> files) throws HiveException {
List<FileStatus> files2 = new LinkedList<>(files);
Iterator<FileStatus> iter = files2.iterator();
while (iter.hasNext()) {
FileStatus file = iter.next();
if (file == null) continue;
if (isPipe(fs, file)) {
LOG.info("Skipping format check for " + file.getPath() + " as it is a pipe");
iter.remove();
}
}
if (files2.isEmpty()) return true;
Set<Class<? extends InputFormat>> inputFormatter = FileChecker.getInstance().registeredClasses();
for (Class<? extends InputFormat> reg : inputFormatter) {
boolean result = checkInputFormat(fs, conf, reg, files2);
if (result) {
return false;
}
}
return true;
}
// See include/uapi/linux/stat.h
private static final int S_IFIFO = 0010000;
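// For example (illustrative mode value): a FIFO with permissions rw-r--r-- has mode
// 0010644, and 0010644 & 0010000 != 0, so isPipe() reports it as a pipe.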
private static boolean isPipe(FileSystem fs, FileStatus file) {
if (fs instanceof DistributedFileSystem) {
return false; // Shortcut for HDFS.
}
int mode = 0;
Object pathToLog = file.getPath();
try {
java.nio.file.Path realPath = Paths.get(file.getPath().toUri());
pathToLog = realPath;
mode = (Integer)Files.getAttribute(realPath, "unix:mode");
} catch (FileSystemNotFoundException t) {
return false; // Probably not a local filesystem; no need to check.
} catch (UnsupportedOperationException | IOException
| SecurityException | IllegalArgumentException t) {
LOG.info("Failed to check mode for " + pathToLog + ": "
+ t.getMessage() + " (" + t.getClass() + ")");
return false;
}
return (mode & S_IFIFO) != 0;
}
public static RecordWriter getHiveRecordWriter(JobConf jc,
TableDesc tableInfo, Class<? extends Writable> outputClass,
FileSinkDesc conf, Path outPath, Reporter reporter) throws HiveException {
HiveOutputFormat<?, ?> hiveOutputFormat = getHiveOutputFormat(jc, tableInfo);
try {
boolean isCompressed = conf.getCompressed();
JobConf jc_output = jc;
if (isCompressed) {
jc_output = new JobConf(jc);
String codecStr = conf.getCompressCodec();
if (codecStr != null && !codecStr.trim().equals("")) {
Class<? extends CompressionCodec> codec =
(Class<? extends CompressionCodec>) JavaUtils.loadClass(codecStr);
FileOutputFormat.setOutputCompressorClass(jc_output, codec);
}
String type = conf.getCompressType();
if (type != null && !type.trim().equals("")) {
CompressionType style = CompressionType.valueOf(type);
// Set the type on the copied JobConf so the caller's conf is not mutated.
SequenceFileOutputFormat.setOutputCompressionType(jc_output, style);
}
}
return getRecordWriter(jc_output, hiveOutputFormat, outputClass,
isCompressed, tableInfo.getProperties(), outPath, reporter);
} catch (Exception e) {
throw new HiveException(e);
}
}
public static RecordWriter getRecordWriter(JobConf jc,
OutputFormat<?, ?> outputFormat,
Class<? extends Writable> valueClass, boolean isCompressed,
Properties tableProp, Path outPath, Reporter reporter
) throws IOException, HiveException {
if (!(outputFormat instanceof HiveOutputFormat)) {
outputFormat = new HivePassThroughOutputFormat(outputFormat);
}
return ((HiveOutputFormat)outputFormat).getHiveRecordWriter(
jc, outPath, valueClass, isCompressed, tableProp, reporter);
}
public static HiveOutputFormat<?, ?> getHiveOutputFormat(Configuration conf, TableDesc tableDesc)
throws HiveException {
return getHiveOutputFormat(conf, tableDesc.getOutputFileFormatClass());
}
public static HiveOutputFormat<?, ?> getHiveOutputFormat(Configuration conf, PartitionDesc partDesc)
throws HiveException {
return getHiveOutputFormat(conf, partDesc.getOutputFileFormatClass());
}
private static HiveOutputFormat<?, ?> getHiveOutputFormat(
Configuration conf, Class<? extends OutputFormat> outputClass) throws HiveException {
OutputFormat<?, ?> outputFormat = ReflectionUtil.newInstance(outputClass, conf);
if (!(outputFormat instanceof HiveOutputFormat)) {
outputFormat = new HivePassThroughOutputFormat(outputFormat);
}
return (HiveOutputFormat<?, ?>) outputFormat;
}
public static RecordUpdater getAcidRecordUpdater(JobConf jc, TableDesc tableInfo, int bucket,
FileSinkDesc conf, Path outPath,
ObjectInspector inspector,
Reporter reporter, int rowIdColNum)
throws HiveException, IOException {
HiveOutputFormat<?, ?> hiveOutputFormat = getHiveOutputFormat(jc, tableInfo);
AcidOutputFormat<?, ?> acidOutputFormat = null;
if (hiveOutputFormat instanceof AcidOutputFormat) {
acidOutputFormat = (AcidOutputFormat)hiveOutputFormat;
} else {
throw new HiveException("Unable to create RecordUpdater for HiveOutputFormat that does not " +
"implement AcidOutputFormat");
}
// TODO: not 100% sure about this. Unlike getHiveRecordWriter, this call does not set the
// compression type on the JobConf; ORC appears to read the value for itself. Not sure
// whether that is correct.
return getRecordUpdater(jc, acidOutputFormat,
bucket, inspector, tableInfo.getProperties(), outPath, reporter, rowIdColNum, conf);
}
private static RecordUpdater getRecordUpdater(JobConf jc,
AcidOutputFormat<?, ?> acidOutputFormat,
int bucket,
ObjectInspector inspector,
Properties tableProp,
Path outPath,
Reporter reporter,
int rowIdColNum,
FileSinkDesc conf) throws IOException {
return acidOutputFormat.getRecordUpdater(outPath, new AcidOutputFormat.Options(jc)
.isCompressed(conf.getCompressed())
.tableProperties(tableProp)
.reporter(reporter)
.writingBase(false)
.minimumTransactionId(conf.getTransactionId())
.maximumTransactionId(conf.getTransactionId())
.bucket(bucket)
.inspector(inspector)
.recordIdColumn(rowIdColNum)
.statementId(conf.getStatementId())
.finalDestination(conf.getDestPath()));
}
public static PartitionDesc getPartitionDescFromPathRecursively(
Map<Path, PartitionDesc> pathToPartitionInfo, Path dir,
Map<Map<Path, PartitionDesc>, Map<Path, PartitionDesc>> cacheMap)
throws IOException {
return getPartitionDescFromPathRecursively(pathToPartitionInfo, dir,
cacheMap, false);
}
public static PartitionDesc getPartitionDescFromPathRecursively(
Map<Path, PartitionDesc> pathToPartitionInfo, Path dir,
Map<Map<Path, PartitionDesc>, Map<Path, PartitionDesc>> cacheMap, boolean ignoreSchema)
throws IOException {
PartitionDesc part = doGetPartitionDescFromPath(pathToPartitionInfo, dir);
if (part == null
&& (ignoreSchema
|| (dir.toUri().getScheme() == null || dir.toUri().getScheme().trim().equals(""))
|| FileUtils.pathsContainNoScheme(pathToPartitionInfo.keySet()))) {
Map<Path, PartitionDesc> newPathToPartitionInfo = null;
if (cacheMap != null) {
newPathToPartitionInfo = cacheMap.get(pathToPartitionInfo);
}
if (newPathToPartitionInfo == null) { // still null
newPathToPartitionInfo = populateNewPartitionDesc(pathToPartitionInfo);
if (cacheMap != null) {
cacheMap.put(pathToPartitionInfo, newPathToPartitionInfo);
}
}
part = doGetPartitionDescFromPath(newPathToPartitionInfo, dir);
}
if (part != null) {
return part;
} else {
throw new IOException("cannot find dir = " + dir.toString()
+ " in pathToPartitionInfo: " + pathToPartitionInfo.keySet());
}
}
private static Map<Path, PartitionDesc> populateNewPartitionDesc(Map<Path, PartitionDesc> pathToPartitionInfo) {
Map<Path, PartitionDesc> newPathToPartitionInfo = new HashMap<>();
for (Map.Entry<Path, PartitionDesc> entry: pathToPartitionInfo.entrySet()) {
PartitionDesc partDesc = entry.getValue();
Path pathOnly = Path.getPathWithoutSchemeAndAuthority(entry.getKey());
newPathToPartitionInfo.put(pathOnly, partDesc);
}
return newPathToPartitionInfo;
}
private static PartitionDesc doGetPartitionDescFromPath(
Map<Path, PartitionDesc> pathToPartitionInfo, Path dir) {
// We first do an exact match, and then fall back to prefix (parent) matching. The latter is
// needed because the input dir could be /dir/ds='2001-02-21'/part-03, where part-03 is not
// part of the partition spec.
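// For example (illustrative paths): if pathToPartitionInfo has the key /dir/ds='2001-02-21'
// and dir is /dir/ds='2001-02-21'/part-03, the exact lookup misses but the parent walk
// resolves to the registered key, whose PartitionDesc is returned.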
Path path = FileUtils.getParentRegardlessOfScheme(dir,pathToPartitionInfo.keySet());
if(path == null) {
// FIXME: old implementation returned null; exception maybe?
return null;
}
return pathToPartitionInfo.get(path);
}
private static boolean foundAlias(Map<Path, ArrayList<String>> pathToAliases,
Path path) {
List<String> aliases = pathToAliases.get(path);
if ((aliases == null) || (aliases.isEmpty())) {
return false;
}
return true;
}
private static Path getMatchingPath(Map<Path, ArrayList<String>> pathToAliases,
Path dir) {
// First find the path to be searched
Path path = dir;
if (foundAlias(pathToAliases, path)) {
return path;
}
Path dirPath = Path.getPathWithoutSchemeAndAuthority(dir);
if (foundAlias(pathToAliases, dirPath)) {
return dirPath;
}
while (path!=null && dirPath!=null) {
path=path.getParent();
dirPath=dirPath.getParent();
//first try full match
if (foundAlias(pathToAliases, path)) {
return path;
}
if (foundAlias(pathToAliases, dirPath)) {
return dirPath;
}
}
return null;
}
/**
* Get the list of operators from the operator tree that are needed for the path
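*
* A hedged usage sketch (paths and aliases are illustrative; aliasToWork is assumed to
* map "tbl" to its table-scan operator; a file beneath the registered directory still
* resolves to that directory's aliases):
* <pre>{@code
* Map<Path, ArrayList<String>> pathToAliases = new HashMap<>();
* pathToAliases.put(new Path("/warehouse/db/tbl"), new ArrayList<>(Arrays.asList("tbl")));
* List<Operator<? extends OperatorDesc>> ops = HiveFileFormatUtils.doGetWorksFromPath(
*     pathToAliases, aliasToWork, new Path("/warehouse/db/tbl/000000_0"));
* }</pre>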
* @param pathToAliases mapping from path to aliases
* @param aliasToWork The operator tree to be invoked for a given alias
* @param dir The path to look for
**/
public static List<Operator<? extends OperatorDesc>> doGetWorksFromPath(
Map<Path, ArrayList<String>> pathToAliases,
Map<String, Operator<? extends OperatorDesc>> aliasToWork, Path dir) {
List<Operator<? extends OperatorDesc>> opList =
new ArrayList<Operator<? extends OperatorDesc>>();
List<String> aliases = doGetAliasesFromPath(pathToAliases, dir);
for (String alias : aliases) {
opList.add(aliasToWork.get(alias));
}
return opList;
}
/**
* Get the list of aliases from the operator tree that are needed for the path
* @param pathToAliases mapping from path to aliases
* @param dir The path to look for
**/
public static List<String> doGetAliasesFromPath(
Map<Path, ArrayList<String>> pathToAliases,
Path dir) {
if (pathToAliases == null) {
return new ArrayList<String>();
}
Path path = getMatchingPath(pathToAliases, dir);
return pathToAliases.get(path);
}
private HiveFileFormatUtils() {
// prevent instantiation
}
public static class NullOutputCommitter extends OutputCommitter {
@Override
public void setupJob(JobContext jobContext) { }
@Override
public void cleanupJob(JobContext jobContext) { }
@Override
public void setupTask(TaskAttemptContext taskContext) { }
@Override
public boolean needsTaskCommit(TaskAttemptContext taskContext) {
return false;
}
@Override
public void commitTask(TaskAttemptContext taskContext) { }
@Override
public void abortTask(TaskAttemptContext taskContext) { }
}
/**
* Hive uses side-effect files exclusively for its output and manages the
* setup/cleanup/commit of output from the Hive client, so it does not need the
* corresponding support inside the MR framework.
*
* This routine sets the options that bypass setup/cleanup/commit support in the
* MR framework, but does not set the OutputFormat class.
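*
* A minimal usage sketch (MyDriver is a placeholder; HiveOutputFormatImpl is shown only
* to illustrate that the caller still chooses the OutputFormat):
* <pre>{@code
* JobConf job = new JobConf(hiveConf, MyDriver.class);
* HiveFileFormatUtils.prepareJobOutput(job);
* job.setOutputFormat(HiveOutputFormatImpl.class);
* }</pre>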
*/
public static void prepareJobOutput(JobConf conf) {
conf.setOutputCommitter(NullOutputCommitter.class);
// The option to bypass job setup and cleanup was introduced in hadoop-21 (MAPREDUCE-463)
// but can be backported, so we disable setup/cleanup in all versions >= 0.19.
conf.setBoolean(MRJobConfig.SETUP_CLEANUP_NEEDED, false);
// The option to bypass the task-cleanup task was introduced in hadoop-23 (MAPREDUCE-2206)
// but can be backported, so we disable it in all versions >= 0.19.
conf.setBoolean(MRJobConfig.TASK_CLEANUP_NEEDED, false);
}
}