/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.hive.hcatalog.mapreduce;

import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Random;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.HiveMetaStoreClient;
import org.apache.hadoop.hive.metastore.MetaStoreUtils;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.NoSuchObjectException;
import org.apache.hadoop.hive.metastore.api.SerDeInfo;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.hive.ql.CompilationOpContext;
import org.apache.hadoop.hive.ql.QueryState;
import org.apache.hadoop.hive.ql.exec.FetchTask;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.metadata.Hive;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.plan.FetchWork;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MiniMRCluster;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

import org.apache.hive.hcatalog.cli.SemanticAnalysis.HCatSemanticAnalyzer;
import org.apache.hive.hcatalog.common.HCatException;
import org.apache.hive.hcatalog.data.DefaultHCatRecord;
import org.apache.hive.hcatalog.data.HCatRecord;
import org.apache.hive.hcatalog.data.schema.HCatFieldSchema;
import org.apache.hive.hcatalog.data.schema.HCatSchema;
import org.apache.hive.hcatalog.data.schema.HCatSchemaUtils;
import org.apache.hive.hcatalog.mapreduce.MultiOutputFormat.JobConfigurer;

import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class TestHCatMultiOutputFormat {

  private static final Logger LOG = LoggerFactory.getLogger(TestHCatMultiOutputFormat.class);

  private static final String DATABASE = "default";
  private static final String[] tableNames = {"test1", "test2", "test3"};
  private static final String[] tablePerms = {"755", "750", "700"};
"700"}; private static Path warehousedir = null; private static HashMap<String, HCatSchema> schemaMap = new HashMap<String, HCatSchema>(); private static HiveMetaStoreClient hmsc; private static MiniMRCluster mrCluster; private static Configuration mrConf; private static HiveConf hiveConf; private static File workDir; private static int msPort; private static Thread t; static { schemaMap.put(tableNames[0], new HCatSchema(ColumnHolder.hCattest1Cols)); schemaMap.put(tableNames[1], new HCatSchema(ColumnHolder.hCattest2Cols)); schemaMap.put(tableNames[2], new HCatSchema(ColumnHolder.hCattest3Cols)); } /** * Private class which holds all the data for the test cases */ private static class ColumnHolder { private static ArrayList<HCatFieldSchema> hCattest1Cols = new ArrayList<HCatFieldSchema>(); private static ArrayList<HCatFieldSchema> hCattest2Cols = new ArrayList<HCatFieldSchema>(); private static ArrayList<HCatFieldSchema> hCattest3Cols = new ArrayList<HCatFieldSchema>(); private static ArrayList<FieldSchema> partitionCols = new ArrayList<FieldSchema>(); private static ArrayList<FieldSchema> test1Cols = new ArrayList<FieldSchema>(); private static ArrayList<FieldSchema> test2Cols = new ArrayList<FieldSchema>(); private static ArrayList<FieldSchema> test3Cols = new ArrayList<FieldSchema>(); private static HashMap<String, List<FieldSchema>> colMapping = new HashMap<String, List<FieldSchema>>(); static { try { FieldSchema keyCol = new FieldSchema("key", serdeConstants.STRING_TYPE_NAME, ""); test1Cols.add(keyCol); test2Cols.add(keyCol); test3Cols.add(keyCol); hCattest1Cols.add(HCatSchemaUtils.getHCatFieldSchema(keyCol)); hCattest2Cols.add(HCatSchemaUtils.getHCatFieldSchema(keyCol)); hCattest3Cols.add(HCatSchemaUtils.getHCatFieldSchema(keyCol)); FieldSchema valueCol = new FieldSchema("value", serdeConstants.STRING_TYPE_NAME, ""); test1Cols.add(valueCol); test3Cols.add(valueCol); hCattest1Cols.add(HCatSchemaUtils.getHCatFieldSchema(valueCol)); hCattest3Cols.add(HCatSchemaUtils.getHCatFieldSchema(valueCol)); FieldSchema extraCol = new FieldSchema("extra", serdeConstants.STRING_TYPE_NAME, ""); test3Cols.add(extraCol); hCattest3Cols.add(HCatSchemaUtils.getHCatFieldSchema(extraCol)); colMapping.put("test1", test1Cols); colMapping.put("test2", test2Cols); colMapping.put("test3", test3Cols); } catch (HCatException e) { LOG.error("Error in setting up schema fields for the table", e); throw new RuntimeException(e); } } static { partitionCols.add(new FieldSchema("ds", serdeConstants.STRING_TYPE_NAME, "")); partitionCols.add(new FieldSchema("cluster", serdeConstants.STRING_TYPE_NAME, "")); } } @BeforeClass public static void setup() throws Exception { System.clearProperty("mapred.job.tracker"); String testDir = System.getProperty("test.tmp.dir", "./"); testDir = testDir + "/test_multitable_" + Math.abs(new Random().nextLong()) + "/"; workDir = new File(new File(testDir).getCanonicalPath()); FileUtil.fullyDelete(workDir); workDir.mkdirs(); warehousedir = new Path(System.getProperty("test.warehouse.dir")); HiveConf metastoreConf = new HiveConf(); metastoreConf.setVar(HiveConf.ConfVars.METASTOREWAREHOUSE, warehousedir.toString()); // Run hive metastore server msPort = MetaStoreUtils.startMetaStore(metastoreConf); // LocalJobRunner does not work with mapreduce OutputCommitter. So need // to use MiniMRCluster. 
    Configuration conf = new Configuration(true);
    conf.set("yarn.scheduler.capacity.root.queues", "default");
    conf.set("yarn.scheduler.capacity.root.default.capacity", "100");

    FileSystem fs = FileSystem.get(conf);
    System.setProperty("hadoop.log.dir", new File(workDir, "/logs").getAbsolutePath());
    mrCluster = new MiniMRCluster(1, fs.getUri().toString(), 1, null, null,
        new JobConf(conf));
    mrConf = mrCluster.createJobConf();

    initializeSetup();

    warehousedir.getFileSystem(conf).mkdirs(warehousedir);
  }

  private static void initializeSetup() throws Exception {
    hiveConf = new HiveConf(mrConf, TestHCatMultiOutputFormat.class);
    hiveConf.setVar(HiveConf.ConfVars.METASTOREURIS, "thrift://localhost:" + msPort);
    hiveConf.setIntVar(HiveConf.ConfVars.METASTORETHRIFTCONNECTIONRETRIES, 3);
    hiveConf.setIntVar(HiveConf.ConfVars.METASTORETHRIFTFAILURERETRIES, 3);
    hiveConf.set(HiveConf.ConfVars.SEMANTIC_ANALYZER_HOOK.varname,
        HCatSemanticAnalyzer.class.getName());
    hiveConf.set(HiveConf.ConfVars.PREEXECHOOKS.varname, "");
    hiveConf.set(HiveConf.ConfVars.POSTEXECHOOKS.varname, "");
    hiveConf.set(HiveConf.ConfVars.HIVE_SUPPORT_CONCURRENCY.varname, "false");
    System.setProperty(HiveConf.ConfVars.PREEXECHOOKS.varname, " ");
    System.setProperty(HiveConf.ConfVars.POSTEXECHOOKS.varname, " ");
    hiveConf.set(HiveConf.ConfVars.METASTOREWAREHOUSE.varname, warehousedir.toString());
    try {
      hmsc = new HiveMetaStoreClient(hiveConf);
      initializeTables();
    } catch (Throwable e) {
      LOG.error("Exception encountered while setting up testcase", e);
      throw new Exception(e);
    } finally {
      hmsc.close();
    }
  }

  private static void initializeTables() throws Exception {
    // Drop any leftover tables from previous runs before re-creating them.
    for (String table : tableNames) {
      try {
        if (hmsc.getTable(DATABASE, table) != null) {
          hmsc.dropTable(DATABASE, table);
        }
      } catch (NoSuchObjectException ignored) {
      }
    }
    for (int i = 0; i < tableNames.length; i++) {
      createTable(tableNames[i], tablePerms[i]);
    }
  }

  private static void createTable(String tableName, String tablePerm) throws Exception {
    Table tbl = new Table();
    tbl.setDbName(DATABASE);
    tbl.setTableName(tableName);
    StorageDescriptor sd = new StorageDescriptor();
    sd.setCols(ColumnHolder.colMapping.get(tableName));
    tbl.setSd(sd);
    sd.setParameters(new HashMap<String, String>());
    sd.setSerdeInfo(new SerDeInfo());
    sd.getSerdeInfo().setName(tbl.getTableName());
    sd.getSerdeInfo().setParameters(new HashMap<String, String>());
    sd.setInputFormat(org.apache.hadoop.hive.ql.io.RCFileInputFormat.class.getName());
    sd.setOutputFormat(org.apache.hadoop.hive.ql.io.RCFileOutputFormat.class.getName());
    sd.getSerdeInfo().getParameters().put(serdeConstants.SERIALIZATION_FORMAT, "1");
    sd.getSerdeInfo().setSerializationLib(
        org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe.class.getName());
    tbl.setPartitionKeys(ColumnHolder.partitionCols);

    hmsc.createTable(tbl);
    Path path = new Path(warehousedir, tableName);
    FileSystem fs = path.getFileSystem(hiveConf);
    fs.setPermission(path, new FsPermission(tablePerm));
  }

  @AfterClass
  public static void tearDown() throws IOException {
    FileUtil.fullyDelete(workDir);
    FileSystem fs = warehousedir.getFileSystem(hiveConf);
    if (fs.exists(warehousedir)) {
      fs.delete(warehousedir, true);
    }
    if (mrCluster != null) {
      mrCluster.shutdown();
    }
  }

  /**
   * Simple test case.
   * <ol>
   * <li>Submits a mapred job which writes out one fixed line to each of the tables</li>
   * <li>uses hive fetch task to read the data and see if it matches what was written</li>
   * </ol>
   *
   * @throws Exception if any error occurs
   */
  @Test
  public void testOutputFormat() throws Throwable {
    HashMap<String, String> partitionValues = new HashMap<String, String>();
    partitionValues.put("ds", "1");
    partitionValues.put("cluster", "ag");
    ArrayList<OutputJobInfo> infoList = new ArrayList<OutputJobInfo>();
    infoList.add(OutputJobInfo.create("default", tableNames[0], partitionValues));
    infoList.add(OutputJobInfo.create("default", tableNames[1], partitionValues));
    infoList.add(OutputJobInfo.create("default", tableNames[2], partitionValues));

    Job job = new Job(hiveConf, "SampleJob");

    job.setMapperClass(MyMapper.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(MultiOutputFormat.class);
    job.setNumReduceTasks(0);

    JobConfigurer configurer = MultiOutputFormat.createConfigurer(job);

    for (int i = 0; i < tableNames.length; i++) {
      configurer.addOutputFormat(tableNames[i], HCatOutputFormat.class, BytesWritable.class,
          HCatRecord.class);
      HCatOutputFormat.setOutput(configurer.getJob(tableNames[i]), infoList.get(i));
      HCatOutputFormat.setSchema(configurer.getJob(tableNames[i]),
          schemaMap.get(tableNames[i]));
    }
    configurer.configure();

    Path filePath = createInputFile();
    FileInputFormat.addInputPath(job, filePath);
    Assert.assertTrue(job.waitForCompletion(true));

    ArrayList<String> outputs = new ArrayList<String>();
    for (String tbl : tableNames) {
      outputs.add(getTableData(tbl, "default").get(0));
    }
    Assert.assertEquals("Comparing output of table " + tableNames[0] + " is not correct",
        outputs.get(0), "a,a,1,ag");
    Assert.assertEquals("Comparing output of table " + tableNames[1] + " is not correct",
        outputs.get(1), "a,1,ag");
    Assert.assertEquals("Comparing output of table " + tableNames[2] + " is not correct",
        outputs.get(2), "a,a,extra,1,ag");

    // Check permission on partition dirs and files created
    for (int i = 0; i < tableNames.length; i++) {
      Path partitionFile = new Path(warehousedir + "/" + tableNames[i]
          + "/ds=1/cluster=ag/part-m-00000");
      FileSystem fs = partitionFile.getFileSystem(mrConf);
      Assert.assertEquals("File permissions of table " + tableNames[i] + " is not correct",
          fs.getFileStatus(partitionFile).getPermission(), new FsPermission(tablePerms[i]));
      Assert.assertEquals("File permissions of table " + tableNames[i] + " is not correct",
          fs.getFileStatus(partitionFile.getParent()).getPermission(),
          new FsPermission(tablePerms[i]));
      Assert.assertEquals("File permissions of table " + tableNames[i] + " is not correct",
          fs.getFileStatus(partitionFile.getParent().getParent()).getPermission(),
          new FsPermission(tablePerms[i]));
    }
    LOG.info("File permissions verified");
  }

  /**
   * Create an input file for map
   *
   * @return absolute path of the file.
   * @throws IOException if any error encountered
   */
  private Path createInputFile() throws IOException {
    Path f = new Path(workDir + "/MultiTableInput.txt");
    FileSystem fs = FileSystem.get(mrConf);
    if (fs.exists(f)) {
      fs.delete(f, true);
    }
    OutputStream out = fs.create(f);
    for (int i = 0; i < 3; i++) {
      out.write("a,a\n".getBytes());
    }
    out.close();
    return f;
  }

  /**
   * Method to fetch table data
   *
   * @param table table name
   * @param database database
   * @return list of columns in comma separated way
   * @throws Exception if any error occurs
   */
  private List<String> getTableData(String table, String database) throws Exception {
    QueryState queryState = new QueryState.Builder().build();
    HiveConf conf = queryState.getConf();
    conf.addResource("hive-site.xml");
    ArrayList<String> results = new ArrayList<String>();
    ArrayList<String> temp = new ArrayList<String>();
    Hive hive = Hive.get(conf);
    org.apache.hadoop.hive.ql.metadata.Table tbl = hive.getTable(database, table);
    FetchWork work;
    if (!tbl.getPartCols().isEmpty()) {
      List<Partition> partitions = hive.getPartitions(tbl);
      List<PartitionDesc> partDesc = new ArrayList<PartitionDesc>();
      List<Path> partLocs = new ArrayList<Path>();
      TableDesc tableDesc = Utilities.getTableDesc(tbl);
      for (Partition part : partitions) {
        partLocs.add(part.getDataLocation());
        partDesc.add(Utilities.getPartitionDescFromTableDesc(tableDesc, part, true));
      }
      work = new FetchWork(partLocs, partDesc, tableDesc);
      work.setLimit(100);
    } else {
      work = new FetchWork(tbl.getDataLocation(), Utilities.getTableDesc(tbl));
    }
    FetchTask task = new FetchTask();
    task.setWork(work);
    task.initialize(queryState, null, null, new CompilationOpContext());
    task.fetch(temp);
    for (String str : temp) {
      results.add(str.replace("\t", ","));
    }
    return results;
  }

  private static class MyMapper extends
      Mapper<LongWritable, Text, BytesWritable, HCatRecord> {

    private int i = 0;

    @Override
    protected void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
      HCatRecord record = null;
      String[] splits = value.toString().split(",");
      // The input file has exactly three lines; line i is written to tableNames[i],
      // shaped to match that table's schema (2, 1 or 3 columns respectively).
      switch (i) {
      case 0:
        record = new DefaultHCatRecord(2);
        record.set(0, splits[0]);
        record.set(1, splits[1]);
        break;
      case 1:
        record = new DefaultHCatRecord(1);
        record.set(0, splits[0]);
        break;
      case 2:
        record = new DefaultHCatRecord(3);
        record.set(0, splits[0]);
        record.set(1, splits[1]);
        record.set(2, "extra");
        break;
      default:
        Assert.fail("This should not happen!!!!!");
      }
      MultiOutputFormat.write(tableNames[i], null, record, context);
      i++;
    }
  }
}