/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.io;

import java.io.IOException;
import java.nio.file.FileSystemNotFoundException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Properties;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hive.common.FileUtils;
import org.apache.hadoop.hive.common.JavaUtils;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.FileSinkDesc;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobContext;
import org.apache.hadoop.mapred.OutputCommitter;
import org.apache.hadoop.mapred.OutputFormat;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.TaskAttemptContext;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.util.Shell;
import org.apache.hive.common.util.ReflectionUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
import com.google.common.collect.ImmutableMap;

/**
 * A utility class for various Hive file format tasks.
 * registerOutputFormatSubstitute(Class, Class) and getOutputFormatSubstitute(Class)
 * are provided for backward compatibility: they return the newer
 * HiveOutputFormat substitute for the older OutputFormat classes.
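 *
 * <p>A minimal usage sketch (illustrative only):
 * <pre>{@code
 * // For a pre-Hive output format, the registered substitute is returned:
 * // IgnoreKeyTextOutputFormat -> HiveIgnoreKeyTextOutputFormat.
 * Class<? extends OutputFormat> substitute =
 *     HiveFileFormatUtils.getOutputFormatSubstitute(IgnoreKeyTextOutputFormat.class);
 * // Classes that already implement HiveOutputFormat are returned unchanged.
 * }</pre>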
 */
public final class HiveFileFormatUtils {

  private static final Logger LOG = LoggerFactory.getLogger(HiveFileFormatUtils.class);

  public static class FileChecker {
    // We don't have many file formats that implement InputFormatChecker, so we won't be
    // holding multiple instances of such classes.
    private static final int MAX_CACHE_SIZE = 16;

    // immutable maps
    Map<Class<? extends InputFormat>, Class<? extends InputFormatChecker>> inputFormatCheckerMap;
    Map<Class<?>, Class<? extends OutputFormat>> outputFormatSubstituteMap;

    // mutable thread-safe cache to store instances
    Cache<Class<? extends InputFormatChecker>, InputFormatChecker> inputFormatCheckerInstanceCache;

    // The classloader initializes this nested holder class when it is first referenced
    // (lazy initialization). Class loading is thread-safe.
    private static class Factory {
      static final FileChecker INSTANCE = new FileChecker();
    }

    public static FileChecker getInstance() {
      return Factory.INSTANCE;
    }

    private FileChecker() {
      // read-only maps (initialized once)
      inputFormatCheckerMap = ImmutableMap
          .<Class<? extends InputFormat>, Class<? extends InputFormatChecker>>builder()
          .put(SequenceFileInputFormat.class, SequenceFileInputFormatChecker.class)
          .put(RCFileInputFormat.class, RCFileInputFormat.class)
          .put(OrcInputFormat.class, OrcInputFormat.class)
          .build();
      outputFormatSubstituteMap = ImmutableMap
          .<Class<?>, Class<? extends OutputFormat>>builder()
          .put(IgnoreKeyTextOutputFormat.class, HiveIgnoreKeyTextOutputFormat.class)
          .put(SequenceFileOutputFormat.class, HiveSequenceFileOutputFormat.class)
          .build();
      // updatable cache that holds instances of the checker classes
      inputFormatCheckerInstanceCache = CacheBuilder.newBuilder().maximumSize(MAX_CACHE_SIZE)
          .build();
    }

    public Set<Class<? extends InputFormat>> registeredClasses() {
      return inputFormatCheckerMap.keySet();
    }

    public Class<? extends OutputFormat> getOutputFormatSubstiture(Class<?> origin) {
      return outputFormatSubstituteMap.get(origin);
    }

    public Class<? extends InputFormatChecker> getInputFormatCheckerClass(Class<?> inputFormat) {
      return inputFormatCheckerMap.get(inputFormat);
    }

    public void putInputFormatCheckerInstance(
        Class<? extends InputFormatChecker> checkerCls, InputFormatChecker instanceCls) {
      inputFormatCheckerInstanceCache.put(checkerCls, instanceCls);
    }

    public InputFormatChecker getInputFormatCheckerInstance(
        Class<? extends InputFormatChecker> checkerCls) {
      return inputFormatCheckerInstanceCache.getIfPresent(checkerCls);
    }
  }

  /**
   * Get an OutputFormat's substitute HiveOutputFormat.
   */
  @SuppressWarnings("unchecked")
  public static Class<? extends OutputFormat> getOutputFormatSubstitute(
      Class<?> origin) {
    if (origin == null || HiveOutputFormat.class.isAssignableFrom(origin)) {
      return (Class<? extends OutputFormat>) origin;  // hive native
    }
    Class<? extends OutputFormat> substitute = FileChecker.getInstance()
        .getOutputFormatSubstiture(origin);
    if (substitute != null) {
      return substitute;  // substituted
    }
    return (Class<? extends OutputFormat>) origin;
  }

  /**
   * Checks whether the given files are in the same format as the given input format.
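   *
   * <p>A minimal sketch of the intended call pattern; {@code fs}, {@code conf}, and the
   * table path here are hypothetical and assumed to be in scope:
   * <pre>{@code
   * List<FileStatus> files = Arrays.asList(fs.listStatus(new Path("/warehouse/t1")));
   * // true only if the checker registered for ORC validates every listed file
   * boolean isOrc = HiveFileFormatUtils.checkInputFormat(fs, conf, OrcInputFormat.class, files);
   * }</pre>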
   */
  @SuppressWarnings("unchecked")
  public static boolean checkInputFormat(FileSystem fs, HiveConf conf,
      Class<? extends InputFormat> inputFormatCls, List<FileStatus> files)
      throws HiveException {
    if (files.isEmpty()) {
      return false;
    }
    Class<? extends InputFormatChecker> checkerCls = FileChecker.getInstance()
        .getInputFormatCheckerClass(inputFormatCls);
    if (checkerCls == null
        && inputFormatCls.isAssignableFrom(TextInputFormat.class)) {
      // We have a text input format here. We cannot determine whether a file is text from
      // its content, so all we can do is test whether some other registered file format
      // accepts it. Only if no other format accepts the file do we treat it as a text file,
      // although it may not be one.
      return checkTextInputFormat(fs, conf, files);
    }

    if (checkerCls != null) {
      InputFormatChecker checkerInstance = FileChecker.getInstance()
          .getInputFormatCheckerInstance(checkerCls);
      try {
        if (checkerInstance == null) {
          checkerInstance = checkerCls.newInstance();
          FileChecker.getInstance().putInputFormatCheckerInstance(checkerCls, checkerInstance);
        }
        return checkerInstance.validateInput(fs, conf, files);
      } catch (Exception e) {
        throw new HiveException(e);
      }
    }
    return true;
  }

  @SuppressWarnings("unchecked")
  private static boolean checkTextInputFormat(FileSystem fs, HiveConf conf,
      List<FileStatus> files) throws HiveException {
    List<FileStatus> files2 = new LinkedList<>(files);
    Iterator<FileStatus> iter = files2.iterator();
    while (iter.hasNext()) {
      FileStatus file = iter.next();
      if (file == null) {
        continue;
      }
      if (isPipe(fs, file)) {
        LOG.info("Skipping format check for " + file.getPath() + " as it is a pipe");
        iter.remove();
      }
    }
    if (files2.isEmpty()) {
      return true;
    }
    Set<Class<? extends InputFormat>> inputFormatter = FileChecker.getInstance().registeredClasses();
    for (Class<? extends InputFormat> reg : inputFormatter) {
      boolean result = checkInputFormat(fs, conf, reg, files2);
      if (result) {
        return false;
      }
    }
    return true;
  }

  // See include/uapi/linux/stat.h
  private static final int S_IFIFO = 0010000;

  private static boolean isPipe(FileSystem fs, FileStatus file) {
    if (fs instanceof DistributedFileSystem) {
      return false; // Shortcut for HDFS.
    }
    int mode = 0;
    Object pathToLog = file.getPath();
    try {
      java.nio.file.Path realPath = Paths.get(file.getPath().toUri());
      pathToLog = realPath;
      mode = (Integer) Files.getAttribute(realPath, "unix:mode");
    } catch (FileSystemNotFoundException t) {
      return false; // Probably not a local filesystem; no need to check.
    } catch (UnsupportedOperationException | IOException
        | SecurityException | IllegalArgumentException t) {
      LOG.info("Failed to check mode for " + pathToLog + ": "
          + t.getMessage() + " (" + t.getClass() + ")");
      return false;
    }
    return (mode & S_IFIFO) != 0;
  }
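
  /**
   * Gets a {@link RecordWriter} for writing rows to {@code outPath} using the table's
   * {@link HiveOutputFormat}. When the {@link FileSinkDesc} requests compression, the
   * configured compression codec and type are applied to the job configuration before
   * the writer is created.
   */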
  public static RecordWriter getHiveRecordWriter(JobConf jc, TableDesc tableInfo,
      Class<? extends Writable> outputClass, FileSinkDesc conf, Path outPath,
      Reporter reporter) throws HiveException {
    HiveOutputFormat<?, ?> hiveOutputFormat = getHiveOutputFormat(jc, tableInfo);
    try {
      boolean isCompressed = conf.getCompressed();
      JobConf jc_output = jc;
      if (isCompressed) {
        jc_output = new JobConf(jc);
        String codecStr = conf.getCompressCodec();
        if (codecStr != null && !codecStr.trim().equals("")) {
          Class<? extends CompressionCodec> codec =
              (Class<? extends CompressionCodec>) JavaUtils.loadClass(codecStr);
          FileOutputFormat.setOutputCompressorClass(jc_output, codec);
        }
        String type = conf.getCompressType();
        if (type != null && !type.trim().equals("")) {
          CompressionType style = CompressionType.valueOf(type);
          SequenceFileOutputFormat.setOutputCompressionType(jc, style);
        }
      }
      return getRecordWriter(jc_output, hiveOutputFormat, outputClass, isCompressed,
          tableInfo.getProperties(), outPath, reporter);
    } catch (Exception e) {
      throw new HiveException(e);
    }
  }

  public static RecordWriter getRecordWriter(JobConf jc,
      OutputFormat<?, ?> outputFormat,
      Class<? extends Writable> valueClass, boolean isCompressed,
      Properties tableProp, Path outPath, Reporter reporter
      ) throws IOException, HiveException {
    if (!(outputFormat instanceof HiveOutputFormat)) {
      outputFormat = new HivePassThroughOutputFormat(outputFormat);
    }
    return ((HiveOutputFormat) outputFormat).getHiveRecordWriter(
        jc, outPath, valueClass, isCompressed, tableProp, reporter);
  }

  public static HiveOutputFormat<?, ?> getHiveOutputFormat(Configuration conf, TableDesc tableDesc)
      throws HiveException {
    return getHiveOutputFormat(conf, tableDesc.getOutputFileFormatClass());
  }

  public static HiveOutputFormat<?, ?> getHiveOutputFormat(Configuration conf,
      PartitionDesc partDesc) throws HiveException {
    return getHiveOutputFormat(conf, partDesc.getOutputFileFormatClass());
  }

  private static HiveOutputFormat<?, ?> getHiveOutputFormat(
      Configuration conf, Class<? extends OutputFormat> outputClass) throws HiveException {
    OutputFormat<?, ?> outputFormat = ReflectionUtil.newInstance(outputClass, conf);
    if (!(outputFormat instanceof HiveOutputFormat)) {
      outputFormat = new HivePassThroughOutputFormat(outputFormat);
    }
    return (HiveOutputFormat<?, ?>) outputFormat;
  }
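
  /**
   * Gets a {@link RecordUpdater} for an ACID write. The table's output format must implement
   * {@link AcidOutputFormat}; otherwise a {@link HiveException} is thrown.
   */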
  public static RecordUpdater getAcidRecordUpdater(JobConf jc, TableDesc tableInfo, int bucket,
                                                   FileSinkDesc conf, Path outPath,
                                                   ObjectInspector inspector,
                                                   Reporter reporter, int rowIdColNum)
      throws HiveException, IOException {
    HiveOutputFormat<?, ?> hiveOutputFormat = getHiveOutputFormat(jc, tableInfo);
    AcidOutputFormat<?, ?> acidOutputFormat = null;
    if (hiveOutputFormat instanceof AcidOutputFormat) {
      acidOutputFormat = (AcidOutputFormat) hiveOutputFormat;
    } else {
      throw new HiveException("Unable to create RecordUpdater for HiveOutputFormat that does not " +
          "implement AcidOutputFormat");
    }
    // TODO not 100% sure about this.  This call doesn't set the compression type in the conf
    // file the way getHiveRecordWriter does, as ORC appears to read the value for itself.  Not
    // sure if this is correct or not.
    return getRecordUpdater(jc, acidOutputFormat, bucket, inspector,
        tableInfo.getProperties(), outPath, reporter, rowIdColNum, conf);
  }

  private static RecordUpdater getRecordUpdater(JobConf jc,
                                                AcidOutputFormat<?, ?> acidOutputFormat,
                                                int bucket,
                                                ObjectInspector inspector,
                                                Properties tableProp,
                                                Path outPath,
                                                Reporter reporter,
                                                int rowIdColNum,
                                                FileSinkDesc conf) throws IOException {
    return acidOutputFormat.getRecordUpdater(outPath, new AcidOutputFormat.Options(jc)
        .isCompressed(conf.getCompressed())
        .tableProperties(tableProp)
        .reporter(reporter)
        .writingBase(false)
        .minimumTransactionId(conf.getTransactionId())
        .maximumTransactionId(conf.getTransactionId())
        .bucket(bucket)
        .inspector(inspector)
        .recordIdColumn(rowIdColNum)
        .statementId(conf.getStatementId())
        .finalDestination(conf.getDestPath()));
  }

  public static PartitionDesc getPartitionDescFromPathRecursively(
      Map<Path, PartitionDesc> pathToPartitionInfo, Path dir,
      Map<Map<Path, PartitionDesc>, Map<Path, PartitionDesc>> cacheMap)
      throws IOException {
    return getPartitionDescFromPathRecursively(pathToPartitionInfo, dir, cacheMap, false);
  }

  public static PartitionDesc getPartitionDescFromPathRecursively(
      Map<Path, PartitionDesc> pathToPartitionInfo, Path dir,
      Map<Map<Path, PartitionDesc>, Map<Path, PartitionDesc>> cacheMap, boolean ignoreSchema)
      throws IOException {

    PartitionDesc part = doGetPartitionDescFromPath(pathToPartitionInfo, dir);

    if (part == null
        && (ignoreSchema
            || (dir.toUri().getScheme() == null || dir.toUri().getScheme().trim().equals(""))
            || FileUtils.pathsContainNoScheme(pathToPartitionInfo.keySet()))) {

      Map<Path, PartitionDesc> newPathToPartitionInfo = null;
      if (cacheMap != null) {
        newPathToPartitionInfo = cacheMap.get(pathToPartitionInfo);
      }

      if (newPathToPartitionInfo == null) { // still null
        newPathToPartitionInfo = populateNewPartitionDesc(pathToPartitionInfo);

        if (cacheMap != null) {
          cacheMap.put(pathToPartitionInfo, newPathToPartitionInfo);
        }
      }
      part = doGetPartitionDescFromPath(newPathToPartitionInfo, dir);
    }
    if (part != null) {
      return part;
    } else {
      throw new IOException("cannot find dir = " + dir.toString()
          + " in pathToPartitionInfo: " + pathToPartitionInfo.keySet());
    }
  }

  private static Map<Path, PartitionDesc> populateNewPartitionDesc(
      Map<Path, PartitionDesc> pathToPartitionInfo) {
    Map<Path, PartitionDesc> newPathToPartitionInfo = new HashMap<>();
    for (Map.Entry<Path, PartitionDesc> entry : pathToPartitionInfo.entrySet()) {
      PartitionDesc partDesc = entry.getValue();
      Path pathOnly = Path.getPathWithoutSchemeAndAuthority(entry.getKey());
      newPathToPartitionInfo.put(pathOnly, partDesc);
    }
    return newPathToPartitionInfo;
  }
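
  /**
   * Looks up the {@link PartitionDesc} for {@code dir} by matching the dir itself, or one of its
   * ancestors regardless of scheme, against the keys of {@code pathToPartitionInfo}; for example,
   * an input dir of /dir/ds='2001-02-21'/part-03 can match a key of /dir/ds='2001-02-21'.
   */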
  private static PartitionDesc doGetPartitionDescFromPath(
      Map<Path, PartitionDesc> pathToPartitionInfo, Path dir) {

    // We first do an exact match, and then fall back to prefix matching, because the input dir
    // could be something like /dir/ds='2001-02-21'/part-03, where part-03 is not part of the
    // partition path.
    Path path = FileUtils.getParentRegardlessOfScheme(dir, pathToPartitionInfo.keySet());

    if (path == null) {
      // FIXME: old implementation returned null; exception maybe?
      return null;
    }
    return pathToPartitionInfo.get(path);
  }

  private static boolean foundAlias(Map<Path, ArrayList<String>> pathToAliases, Path path) {
    List<String> aliases = pathToAliases.get(path);
    if ((aliases == null) || (aliases.isEmpty())) {
      return false;
    }
    return true;
  }

  private static Path getMatchingPath(Map<Path, ArrayList<String>> pathToAliases, Path dir) {
    // First find the path to be searched
    Path path = dir;
    if (foundAlias(pathToAliases, path)) {
      return path;
    }

    Path dirPath = Path.getPathWithoutSchemeAndAuthority(dir);
    if (foundAlias(pathToAliases, dirPath)) {
      return dirPath;
    }

    while (path != null && dirPath != null) {
      path = path.getParent();
      dirPath = dirPath.getParent();
      // first try full match
      if (foundAlias(pathToAliases, path)) {
        return path;
      }
      if (foundAlias(pathToAliases, dirPath)) {
        return dirPath;
      }
    }
    return null;
  }

  /**
   * Get the list of operators from the operator tree that are needed for the path.
   * @param pathToAliases  mapping from path to aliases
   * @param aliasToWork    the operator tree to be invoked for a given alias
   * @param dir            the path to look for
   **/
  public static List<Operator<? extends OperatorDesc>> doGetWorksFromPath(
      Map<Path, ArrayList<String>> pathToAliases,
      Map<String, Operator<? extends OperatorDesc>> aliasToWork, Path dir) {
    List<Operator<? extends OperatorDesc>> opList =
        new ArrayList<Operator<? extends OperatorDesc>>();

    List<String> aliases = doGetAliasesFromPath(pathToAliases, dir);
    for (String alias : aliases) {
      opList.add(aliasToWork.get(alias));
    }
    return opList;
  }

  /**
   * Get the list of aliases from the operator tree that are needed for the path.
   * @param pathToAliases  mapping from path to aliases
   * @param dir            the path to look for
   **/
  public static List<String> doGetAliasesFromPath(
      Map<Path, ArrayList<String>> pathToAliases, Path dir) {
    if (pathToAliases == null) {
      return new ArrayList<String>();
    }
    Path path = getMatchingPath(pathToAliases, dir);
    return pathToAliases.get(path);
  }

  private HiveFileFormatUtils() {
    // prevent instantiation
  }

  public static class NullOutputCommitter extends OutputCommitter {
    @Override
    public void setupJob(JobContext jobContext) {
    }

    @Override
    public void cleanupJob(JobContext jobContext) {
    }

    @Override
    public void setupTask(TaskAttemptContext taskContext) {
    }

    @Override
    public boolean needsTaskCommit(TaskAttemptContext taskContext) {
      return false;
    }

    @Override
    public void commitTask(TaskAttemptContext taskContext) {
    }

    @Override
    public void abortTask(TaskAttemptContext taskContext) {
    }
  }

  /**
   * Hive uses side-effect files exclusively for its output. It also manages the
   * setup/cleanup/commit of output from the Hive client, so it does not need the same
   * support inside the MR framework.
   *
   * This routine sets the appropriate options to bypass setup/cleanup/commit support
   * in the MR framework, but does not set the OutputFormat class.
   */
  public static void prepareJobOutput(JobConf conf) {
    conf.setOutputCommitter(NullOutputCommitter.class);

    // option to bypass job setup and cleanup was introduced in hadoop-21 (MAPREDUCE-463)
    // but can be backported. So we disable setup/cleanup in all versions >= 0.19
    conf.setBoolean(MRJobConfig.SETUP_CLEANUP_NEEDED, false);

    // option to bypass task cleanup task was introduced in hadoop-23 (MAPREDUCE-2206)
    // but can be backported. So we disable setup/cleanup in all versions >= 0.19
    conf.setBoolean(MRJobConfig.TASK_CLEANUP_NEEDED, false);
  }
}