/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.io;

import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.FileSinkDesc;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputFormat;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.TextInputFormat;

/**
 * A utility class for various Hive file format tasks.
 * registerOutputFormatSubstitute(Class, Class) and
 * getOutputFormatSubstitute(Class) are added for backward compatibility:
 * they return the newly added HiveOutputFormat for the older OutputFormat
 * classes.
 */
public final class HiveFileFormatUtils {

  static {
    outputFormatSubstituteMap =
        new HashMap<Class<? extends OutputFormat>, Class<? extends HiveOutputFormat>>();
    HiveFileFormatUtils.registerOutputFormatSubstitute(
        IgnoreKeyTextOutputFormat.class, HiveIgnoreKeyTextOutputFormat.class);
    HiveFileFormatUtils.registerOutputFormatSubstitute(
        SequenceFileOutputFormat.class, HiveSequenceFileOutputFormat.class);
  }

  @SuppressWarnings("unchecked")
  private static Map<Class<? extends OutputFormat>, Class<? extends HiveOutputFormat>>
      outputFormatSubstituteMap;

  /**
   * register a substitute.
   *
   * @param origin
   *          the class that needs to be substituted
   * @param substitute
   *          the HiveOutputFormat to use in its place
   */
  @SuppressWarnings("unchecked")
  public static synchronized void registerOutputFormatSubstitute(
      Class<? extends OutputFormat> origin,
      Class<? extends HiveOutputFormat> substitute) {
    outputFormatSubstituteMap.put(origin, substitute);
  }

  /**
   * get an OutputFormat's substitute HiveOutputFormat.
   */
  @SuppressWarnings("unchecked")
  public static synchronized Class<? extends HiveOutputFormat> getOutputFormatSubstitute(
      Class<?> origin) {
    if (HiveOutputFormat.class.isAssignableFrom(origin)) {
      return (Class<? extends HiveOutputFormat>) origin;
    }
    Class<? extends HiveOutputFormat> result = outputFormatSubstituteMap
        .get(origin);
    return result;
  }
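  // Illustrative use of the substitution lookup above (a sketch; the caller and
  // surrounding code are assumptions, not part of this class):
  //
  //   Class<? extends HiveOutputFormat> substituted =
  //       HiveFileFormatUtils.getOutputFormatSubstitute(SequenceFileOutputFormat.class);
  //   // returns HiveSequenceFileOutputFormat.class, registered in the static block
  //   // above; a class that already implements HiveOutputFormat is returned unchanged.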
  /**
   * get the final output path of a given FileOutputFormat.
   *
   * @param parent
   *          parent dir of the expected final output path
   * @param jc
   *          job configuration
   * @deprecated
   */
  @Deprecated
  public static Path getOutputFormatFinalPath(Path parent, String taskId,
      JobConf jc, HiveOutputFormat<?, ?> hiveOutputFormat,
      boolean isCompressed, Path defaultFinalPath) throws IOException {
    if (hiveOutputFormat instanceof HiveIgnoreKeyTextOutputFormat) {
      return new Path(parent, taskId
          + Utilities.getFileExtension(jc, isCompressed));
    }
    return defaultFinalPath;
  }

  static {
    inputFormatCheckerMap =
        new HashMap<Class<? extends InputFormat>, Class<? extends InputFormatChecker>>();
    HiveFileFormatUtils.registerInputFormatChecker(
        SequenceFileInputFormat.class, SequenceFileInputFormatChecker.class);
    HiveFileFormatUtils.registerInputFormatChecker(RCFileInputFormat.class,
        RCFileInputFormat.class);
    inputFormatCheckerInstanceCache =
        new HashMap<Class<? extends InputFormatChecker>, InputFormatChecker>();
  }

  @SuppressWarnings("unchecked")
  private static Map<Class<? extends InputFormat>, Class<? extends InputFormatChecker>>
      inputFormatCheckerMap;

  private static Map<Class<? extends InputFormatChecker>, InputFormatChecker>
      inputFormatCheckerInstanceCache;

  /**
   * register an InputFormatChecker for a given InputFormat.
   *
   * @param format
   *          the InputFormat class to be checked
   * @param checker
   *          the InputFormatChecker that validates files of that format
   */
  @SuppressWarnings("unchecked")
  public static synchronized void registerInputFormatChecker(
      Class<? extends InputFormat> format,
      Class<? extends InputFormatChecker> checker) {
    inputFormatCheckerMap.put(format, checker);
  }

  /**
   * get an InputFormatChecker for a file format.
   */
  public static synchronized Class<? extends InputFormatChecker> getInputFormatChecker(
      Class<?> inputFormat) {
    Class<? extends InputFormatChecker> result = inputFormatCheckerMap
        .get(inputFormat);
    return result;
  }

  /**
   * checks if files are in the same format as the given input format.
   */
  @SuppressWarnings("unchecked")
  public static boolean checkInputFormat(FileSystem fs, HiveConf conf,
      Class<? extends InputFormat> inputFormatCls, ArrayList<FileStatus> files)
      throws HiveException {
    if (files.size() > 0) {
      Class<? extends InputFormatChecker> checkerCls =
          getInputFormatChecker(inputFormatCls);
      if (checkerCls == null
          && inputFormatCls.isAssignableFrom(TextInputFormat.class)) {
        // We have a text input format here. We cannot determine whether a file
        // is text from its content alone, so all we can do is test whether any
        // other registered file format accepts it. If no other file format
        // accepts the files, we treat them as text, although they may not be.
        return checkTextInputFormat(fs, conf, files);
      }
      if (checkerCls != null) {
        InputFormatChecker checkerInstance = inputFormatCheckerInstanceCache
            .get(checkerCls);
        try {
          if (checkerInstance == null) {
            checkerInstance = checkerCls.newInstance();
            inputFormatCheckerInstanceCache.put(checkerCls, checkerInstance);
          }
          return checkerInstance.validateInput(fs, conf, files);
        } catch (Exception e) {
          throw new HiveException(e);
        }
      }
      return true;
    }
    return false;
  }
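  // Illustrative call into the checker above (a sketch; fs, conf, and the file
  // listing are assumed to come from the caller and are not part of this class):
  //
  //   ArrayList<FileStatus> files =
  //       new ArrayList<FileStatus>(Arrays.asList(fs.listStatus(dir)));
  //   boolean ok = HiveFileFormatUtils.checkInputFormat(
  //       fs, conf, RCFileInputFormat.class, files);
  //   // true when every listed file validates via the registered RCFileInputFormat
  //   // checker; an empty file list yields false.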
  @SuppressWarnings("unchecked")
  private static boolean checkTextInputFormat(FileSystem fs, HiveConf conf,
      ArrayList<FileStatus> files) throws HiveException {
    Set<Class<? extends InputFormat>> inputFormatter = inputFormatCheckerMap
        .keySet();
    for (Class<? extends InputFormat> reg : inputFormatter) {
      boolean result = checkInputFormat(fs, conf, reg, files);
      if (result) {
        return false;
      }
    }
    return true;
  }

  /**
   * get a RecordWriter for the given table, honoring the compression codec and
   * compression type configured in the FileSinkDesc.
   */
  public static RecordWriter getHiveRecordWriter(JobConf jc,
      TableDesc tableInfo, Class<? extends Writable> outputClass,
      FileSinkDesc conf, Path outPath) throws HiveException {
    try {
      HiveOutputFormat<?, ?> hiveOutputFormat = tableInfo
          .getOutputFileFormatClass().newInstance();
      boolean isCompressed = conf.getCompressed();
      JobConf jc_output = jc;
      if (isCompressed) {
        // work on a copy of the JobConf so the caller's conf is not modified
        jc_output = new JobConf(jc);
        String codecStr = conf.getCompressCodec();
        if (codecStr != null && !codecStr.trim().equals("")) {
          Class<? extends CompressionCodec> codec =
              (Class<? extends CompressionCodec>) Class.forName(codecStr);
          FileOutputFormat.setOutputCompressorClass(jc_output, codec);
        }
        String type = conf.getCompressType();
        if (type != null && !type.trim().equals("")) {
          CompressionType style = CompressionType.valueOf(type);
          SequenceFileOutputFormat.setOutputCompressionType(jc_output, style);
        }
      }
      return getRecordWriter(jc_output, hiveOutputFormat, outputClass,
          isCompressed, tableInfo.getProperties(), outPath);
    } catch (Exception e) {
      throw new HiveException(e);
    }
  }

  /**
   * get a RecordWriter from the given HiveOutputFormat, or null if no output
   * format is supplied.
   */
  public static RecordWriter getRecordWriter(JobConf jc,
      HiveOutputFormat<?, ?> hiveOutputFormat,
      final Class<? extends Writable> valueClass, boolean isCompressed,
      Properties tableProp, Path outPath) throws IOException, HiveException {
    if (hiveOutputFormat != null) {
      return hiveOutputFormat.getHiveRecordWriter(jc, outPath, valueClass,
          isCompressed, tableProp, null);
    }
    return null;
  }

  /**
   * get the PartitionDesc for a directory, trying scheme/authority-stripped
   * keys when no exact entry exists.
   */
  public static PartitionDesc getPartitionDescFromPathRecursively(
      Map<String, PartitionDesc> pathToPartitionInfo, Path dir,
      Map<Map<String, PartitionDesc>, Map<String, PartitionDesc>> cacheMap)
      throws IOException {
    return getPartitionDescFromPathRecursively(pathToPartitionInfo, dir,
        cacheMap, false);
  }

  public static PartitionDesc getPartitionDescFromPathRecursively(
      Map<String, PartitionDesc> pathToPartitionInfo, Path dir,
      Map<Map<String, PartitionDesc>, Map<String, PartitionDesc>> cacheMap,
      boolean ignoreSchema) throws IOException {

    PartitionDesc part = doGetPartitionDescFromPath(pathToPartitionInfo, dir);

    if (part == null
        && (ignoreSchema || (dir.toUri().getScheme() == null || dir.toUri()
            .getScheme().trim().equals("")))) {
      Map<String, PartitionDesc> newPathToPartitionInfo = null;
      if (cacheMap != null) {
        newPathToPartitionInfo = cacheMap.get(pathToPartitionInfo);
      }

      if (newPathToPartitionInfo == null) { // still null
        newPathToPartitionInfo = new HashMap<String, PartitionDesc>();
        populateNewPartitionDesc(pathToPartitionInfo, newPathToPartitionInfo);

        if (cacheMap != null) {
          cacheMap.put(pathToPartitionInfo, newPathToPartitionInfo);
        }
      }
      part = doGetPartitionDescFromPath(newPathToPartitionInfo, dir);
    }

    if (part != null) {
      return part;
    } else {
      throw new IOException("cannot find dir = " + dir.toString()
          + " in pathToPartitionInfo: " + pathToPartitionInfo.keySet());
    }
  }
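  // Illustrative lookup (a sketch; the map contents and paths below are made up
  // for the example and are not part of this class):
  //
  //   Map<String, PartitionDesc> pathToPartitionInfo = ...;  // e.g. from the plan
  //   pathToPartitionInfo.put("/warehouse/tbl/ds=2001-02-21", partDesc);
  //   PartitionDesc part = HiveFileFormatUtils.getPartitionDescFromPathRecursively(
  //       pathToPartitionInfo, new Path("/warehouse/tbl/ds=2001-02-21/part-03"), null);
  //   // part == partDesc: the lookup walks up the directory tree until a registered
  //   // prefix matches; an IOException is thrown if nothing matches.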
  private static void populateNewPartitionDesc(
      Map<String, PartitionDesc> pathToPartitionInfo,
      Map<String, PartitionDesc> newPathToPartitionInfo) {
    for (Map.Entry<String, PartitionDesc> entry : pathToPartitionInfo.entrySet()) {
      String entryKey = entry.getKey();
      PartitionDesc partDesc = entry.getValue();
      Path newP = new Path(entryKey);
      String pathOnly = newP.toUri().getPath();
      newPathToPartitionInfo.put(pathOnly, partDesc);
    }
  }

  private static PartitionDesc doGetPartitionDescFromPath(
      Map<String, PartitionDesc> pathToPartitionInfo, Path dir) {
    // We first do an exact match, and then do prefix matching. The latter is
    // needed because the input dir could be /dir/ds='2001-02-21'/part-03, where
    // part-03 is not part of the partition path.
    String dirPath = dir.toUri().getPath();
    PartitionDesc part = pathToPartitionInfo.get(dir.toString());
    if (part == null) {
      // LOG.warn("exact match not found, try stripping the input path's scheme and authority");
      part = pathToPartitionInfo.get(dirPath);
    }

    if (part == null) {
      String dirStr = dir.toString();
      int dirPathIndex = dirPath.lastIndexOf(Path.SEPARATOR);
      int dirStrIndex = dirStr.lastIndexOf(Path.SEPARATOR);
      while (dirPathIndex >= 0 && dirStrIndex >= 0) {
        dirStr = dirStr.substring(0, dirStrIndex);
        dirPath = dirPath.substring(0, dirPathIndex);
        // first try a full match
        part = pathToPartitionInfo.get(dirStr);
        if (part == null) {
          // LOG.warn("exact match not found, try stripping the input path's scheme and authority");
          part = pathToPartitionInfo.get(dirPath);
        }
        if (part != null) {
          break;
        }
        dirPathIndex = dirPath.lastIndexOf(Path.SEPARATOR);
        dirStrIndex = dirStr.lastIndexOf(Path.SEPARATOR);
      }
    }
    return part;
  }

  private static boolean foundAlias(Map<String, ArrayList<String>> pathToAliases,
      String path) {
    List<String> aliases = pathToAliases.get(path);
    if ((aliases == null) || (aliases.isEmpty())) {
      return false;
    }
    return true;
  }

  private static String getMatchingPath(Map<String, ArrayList<String>> pathToAliases,
      Path dir) {
    // First find the path to be searched
    String path = dir.toString();
    if (foundAlias(pathToAliases, path)) {
      return path;
    }

    String dirPath = dir.toUri().getPath();
    if (foundAlias(pathToAliases, dirPath)) {
      return dirPath;
    }
    path = dirPath;

    String dirStr = dir.toString();
    int dirPathIndex = dirPath.lastIndexOf(Path.SEPARATOR);
    int dirStrIndex = dirStr.lastIndexOf(Path.SEPARATOR);
    while (dirPathIndex >= 0 && dirStrIndex >= 0) {
      dirStr = dirStr.substring(0, dirStrIndex);
      dirPath = dirPath.substring(0, dirPathIndex);
      // first try a full match
      if (foundAlias(pathToAliases, dirStr)) {
        return dirStr;
      }
      if (foundAlias(pathToAliases, dirPath)) {
        return dirPath;
      }
      dirPathIndex = dirPath.lastIndexOf(Path.SEPARATOR);
      dirStrIndex = dirStr.lastIndexOf(Path.SEPARATOR);
    }
    return null;
  }

  /**
   * Get the list of operators from the operator tree that are needed for the path.
   *
   * @param pathToAliases
   *          mapping from path to aliases
   * @param aliasToWork
   *          the operator tree to be invoked for a given alias
   * @param dir
   *          the path to look for
   */
  public static List<Operator<? extends Serializable>> doGetWorksFromPath(
      Map<String, ArrayList<String>> pathToAliases,
      Map<String, Operator<? extends Serializable>> aliasToWork, Path dir) {
    List<Operator<? extends Serializable>> opList =
        new ArrayList<Operator<? extends Serializable>>();

    List<String> aliases = doGetAliasesFromPath(pathToAliases, dir);
    for (String alias : aliases) {
      opList.add(aliasToWork.get(alias));
    }
    return opList;
  }

  /**
   * Get the list of aliases from the operator tree that are needed for the path.
   *
   * @param pathToAliases
   *          mapping from path to aliases
   * @param dir
   *          the path to look for
   */
  public static List<String> doGetAliasesFromPath(
      Map<String, ArrayList<String>> pathToAliases, Path dir) {
    if (pathToAliases == null) {
      return new ArrayList<String>();
    }
    String path = getMatchingPath(pathToAliases, dir);
    return pathToAliases.get(path);
  }

  private HiveFileFormatUtils() {
    // prevent instantiation
  }
}
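// Illustrative alias lookup (a sketch; the map contents and paths below are made
// up for the example and are not part of this class):
//
//   Map<String, ArrayList<String>> pathToAliases = new HashMap<String, ArrayList<String>>();
//   pathToAliases.put("/warehouse/tbl", new ArrayList<String>(Arrays.asList("t1")));
//   List<String> aliases = HiveFileFormatUtils.doGetAliasesFromPath(
//       pathToAliases, new Path("/warehouse/tbl/ds=2001-02-21/part-03"));
//   // getMatchingPath walks up the directory prefixes, so this returns ["t1"].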