/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.hive.hcatalog.mapreduce;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.IMetaStoreClient;
import org.apache.hadoop.hive.metastore.MetaStoreUtils;
import org.apache.hadoop.hive.ql.metadata.HiveStorageHandler;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.metastore.api.NoSuchObjectException;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hive.hcatalog.common.ErrorType;
import org.apache.hive.hcatalog.common.HCatConstants;
import org.apache.hive.hcatalog.common.HCatException;
import org.apache.hive.hcatalog.common.HCatUtil;
import org.apache.hive.hcatalog.data.HCatRecord;
import org.apache.thrift.TException;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

/**
 * File-based storage (i.e. RCFile, Text, etc.) implementation of OutputFormatContainer.
 * This implementation supports the following HCatalog features:
 * partitioning, dynamic partitioning, Hadoop Archiving, etc.
 */
class FileOutputFormatContainer extends OutputFormatContainer {

  /**
   * @param of base OutputFormat to contain
   */
  public FileOutputFormatContainer(org.apache.hadoop.mapred.OutputFormat<? super WritableComparable<?>, ? super Writable> of) {
    super(of);
  }

  @Override
  public RecordWriter<WritableComparable<?>, HCatRecord> getRecordWriter(TaskAttemptContext context)
    throws IOException, InterruptedException {
    // This needs to be set manually; under normal circumstances the MR Task does this.
    setWorkOutputPath(context);

    // Configure the output key and value classes.
    // This is required for writing null as the key for file-based tables.
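    // The key is always NullWritable; the value class comes from the table's SerDe
    // (resolved below via the storage handler), so the writer's value type matches
    // what the SerDe serializes.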
context.getConfiguration().set("mapred.output.key.class", NullWritable.class.getName()); String jobInfoString = context.getConfiguration().get( HCatConstants.HCAT_KEY_OUTPUT_INFO); OutputJobInfo jobInfo = (OutputJobInfo) HCatUtil .deserialize(jobInfoString); StorerInfo storeInfo = jobInfo.getTableInfo().getStorerInfo(); HiveStorageHandler storageHandler = HCatUtil.getStorageHandler( context.getConfiguration(), storeInfo); Class<? extends AbstractSerDe> serde = storageHandler.getSerDeClass(); AbstractSerDe sd = (AbstractSerDe) ReflectionUtils.newInstance(serde, context.getConfiguration()); context.getConfiguration().set("mapred.output.value.class", sd.getSerializedClass().getName()); RecordWriter<WritableComparable<?>, HCatRecord> rw; if (HCatBaseOutputFormat.getJobInfo(context.getConfiguration()).isDynamicPartitioningUsed()){ // When Dynamic partitioning is used, the RecordWriter instance initialized here isn't used. Can use null. // (That's because records can't be written until the values of the dynamic partitions are deduced. // By that time, a new local instance of RecordWriter, with the correct output-path, will be constructed.) rw = new DynamicPartitionFileRecordWriterContainer( (org.apache.hadoop.mapred.RecordWriter)null, context); } else { Path parentDir = new Path(context.getConfiguration().get("mapred.work.output.dir")); Path childPath = new Path(parentDir,FileOutputFormat.getUniqueName(new JobConf(context.getConfiguration()), context.getConfiguration().get("mapreduce.output.basename", "part"))); rw = new StaticPartitionFileRecordWriterContainer( getBaseOutputFormat().getRecordWriter( parentDir.getFileSystem(context.getConfiguration()), new JobConf(context.getConfiguration()), childPath.toString(), InternalUtil.createReporter(context)), context); } return rw; } @Override public void checkOutputSpecs(JobContext context) throws IOException, InterruptedException { OutputJobInfo jobInfo = HCatOutputFormat.getJobInfo(context.getConfiguration()); IMetaStoreClient client = null; try { HiveConf hiveConf = HCatUtil.getHiveConf(context.getConfiguration()); client = HCatUtil.getHiveMetastoreClient(hiveConf); handleDuplicatePublish(context, jobInfo, client, new Table(jobInfo.getTableInfo().getTable())); } catch (MetaException e) { throw new IOException(e); } catch (TException e) { throw new IOException(e); } finally { HCatUtil.closeHiveClientQuietly(client); } if (!jobInfo.isDynamicPartitioningUsed()) { JobConf jobConf = new JobConf(context.getConfiguration()); getBaseOutputFormat().checkOutputSpecs(null, jobConf); //checkoutputspecs might've set some properties we need to have context reflect that HCatUtil.copyConf(jobConf, context.getConfiguration()); } } @Override public OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException, InterruptedException { //this needs to be manually set, under normal circumstances MR Task does this setWorkOutputPath(context); return new FileOutputCommitterContainer(context, HCatBaseOutputFormat.getJobInfo(context.getConfiguration()).isDynamicPartitioningUsed() ? null : new JobConf(context.getConfiguration()).getOutputCommitter()); } /** * Handles duplicate publish of partition or data into an unpartitioned table * if the table is immutable * * For partitioned tables, fails if partition already exists. * For non partitioned tables, fails if files are present in table directory. 
   * For dynamic partitioned publish, does nothing - the check would need to be done at RecordWriter time.
   * @param context the job
   * @param outputInfo the output info
   * @param client the metastore client
   * @param table the table being written to
   * @throws IOException
   * @throws org.apache.hadoop.hive.metastore.api.MetaException
   * @throws org.apache.thrift.TException
   */
  private static void handleDuplicatePublish(JobContext context, OutputJobInfo outputInfo,
      IMetaStoreClient client, Table table)
    throws IOException, MetaException, TException, NoSuchObjectException {

    /*
     * For a fully specified partition, follow strict checks for existence of the partition in metadata.
     * For unpartitioned tables, follow file checks.
     * For partially specified partitions:
     *   This would then need file checks at the start of a partition write.
     *   Doing metadata checks can get potentially very expensive (fat conf) if
     *   there are a large number of partitions that match the partial specifications.
     */

    if (!table.isImmutable()) {
      return;
    }
    if (table.getPartitionKeys().size() > 0) {
      if (!outputInfo.isDynamicPartitioningUsed()) {
        List<String> partitionValues = getPartitionValueList(
          table, outputInfo.getPartitionValues());
        // fully-specified partition
        List<String> currentParts = client.listPartitionNames(outputInfo.getDatabaseName(),
          outputInfo.getTableName(), partitionValues, (short) 1);

        if (currentParts.size() > 0) {
          // If a table is partitioned and immutable, then the presence
          // of the partition alone is enough to throw an error - we do
          // not need to check for emptiness to decide to throw an error.
          throw new HCatException(ErrorType.ERROR_DUPLICATE_PARTITION);
        }
      }
    } else {
      List<String> partitionValues = getPartitionValueList(
        table, outputInfo.getPartitionValues());
      // non-partitioned table
      Path tablePath = new Path(table.getTTable().getSd().getLocation());
      FileSystem fs = tablePath.getFileSystem(context.getConfiguration());

      if (!MetaStoreUtils.isDirEmpty(fs, tablePath)) {
        throw new HCatException(ErrorType.ERROR_NON_EMPTY_TABLE,
          table.getDbName() + "." + table.getTableName());
      }
    }
  }

  /**
   * Convert the partition value map to a value list in the partition key order.
   * @param table the table being written to
   * @param valueMap the partition value map
   * @return the partition value list
   * @throws java.io.IOException
   */
  static List<String> getPartitionValueList(Table table, Map<String, String> valueMap)
    throws IOException {

    if (valueMap.size() != table.getPartitionKeys().size()) {
      throw new HCatException(ErrorType.ERROR_INVALID_PARTITION_VALUES,
        "Table " + table.getTableName() + " has " +
          table.getPartitionKeys().size() + " partition keys, got " +
          valueMap.size());
    }

    List<String> values = new ArrayList<String>();

    for (FieldSchema schema : table.getPartitionKeys()) {
      String value = valueMap.get(schema.getName().toLowerCase());

      if (value == null) {
        throw new HCatException(ErrorType.ERROR_MISSING_PARTITION_KEY,
          "Key " + schema.getName() + " of table " + table.getTableName());
      }

      values.add(value);
    }

    return values;
  }

  static void setWorkOutputPath(TaskAttemptContext context) throws IOException {
    String outputPath = context.getConfiguration().get("mapred.output.dir");
    // We need to do this to get the task path and set it for the mapred implementation,
    // since it can't be done automatically because of the mapreduce -> mapred abstraction.
    if (outputPath != null) {
      context.getConfiguration().set("mapred.work.output.dir",
        new FileOutputCommitter(new Path(outputPath), context).getWorkPath().toString());
    }
  }
}