/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.io.orc;

import static junit.framework.Assert.assertEquals;
import static junit.framework.Assert.assertTrue;
import static junit.framework.Assert.assertFalse;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.UUID;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.OrcSerde.OrcSerdeRow;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hive.common.util.HiveTestUtils;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TestName;

public class TestNewInputOutputFormat {

  Path workDir = new Path(System.getProperty("test.tmp.dir",
      "target" + File.separator + "test" + File.separator + "tmp"));

  Configuration conf;
  FileSystem localFs;

  @Before
  public void setup() throws Exception {
    conf = new Configuration();
    conf.set("mapred.job.tracker", "local");
    conf.set("fs.default.name", "local");
    localFs = FileSystem.get(conf);
  }

  @Rule
  public TestName testCaseName = new TestName();

  public static class OrcTestMapper1 extends
      Mapper<Object, Writable, Text, Text> {
    @Override
    public void map(Object key, Writable value, Context context)
        throws IOException, InterruptedException {
      context.write(null, new Text(value.toString()));
    }
  }

  @Test // Test regular inputformat
  public void testNewInputFormat() throws Exception {
    Job job = new Job(conf, "orc test");
    job.setInputFormatClass(OrcNewInputFormat.class);
    job.setJarByClass(TestNewInputOutputFormat.class);
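    // Map-only job: with no reducers, the mapper's Text output is written
    // by the default TextOutputFormat, one ORC row's toString() per line.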
    job.setMapperClass(OrcTestMapper1.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(HiveTestUtils
        .getFileFromClasspath("orc-file-11-format.orc")));
    Path outputPath = new Path(workDir,
        "TestOrcFile." + testCaseName.getMethodName() + ".txt");
    localFs.delete(outputPath, true);
    FileOutputFormat.setOutputPath(job, outputPath);
    boolean result = job.waitForCompletion(true);
    assertTrue(result);

    Path outputFilePath = new Path(outputPath, "part-m-00000");
    assertTrue(localFs.exists(outputFilePath));
    BufferedReader reader = new BufferedReader(
        new InputStreamReader(localFs.open(outputFilePath)));
    int count = 0;
    String line;
    String lastLine = null;
    while ((line = reader.readLine()) != null) {
      count++;
      lastLine = line;
    }
    reader.close();
    // All 7500 rows of orc-file-11-format.orc should come through, ending
    // with a known last row.
    assertEquals(count, 7500);
    assertEquals(lastLine, "{true, 100, 2048, 65536,"
        + " 9223372036854775807, 2.0, -5.0"
        + ", , bye, {[{1, bye}, {2, sigh}]}, [{100000000, cat},"
        + " {-100000, in}, {1234, hat}],"
        + " {chani={5, chani}, mauddib={1, mauddib}},"
        + " 2000-03-12 15:00:01, 12345678.6547457}");
    localFs.delete(outputPath, true);
  }

  public static class OrcTestMapper2 extends
      Mapper<Object, Text, Object, Writable> {
    private final TypeInfo typeInfo = TypeInfoUtils
        .getTypeInfoFromTypeString("struct<a:int,b:string>");
    private final ObjectInspector oip = TypeInfoUtils
        .getStandardJavaObjectInspectorFromTypeInfo(typeInfo);
    private final OrcSerde serde = new OrcSerde();
    private Writable row;

    @Override
    public void map(Object key, Text value, Context context)
        throws IOException, InterruptedException {
      // Parse each "int,string" line into a struct<a:int,b:string> and
      // serialize it to a writable ORC row.
      String[] items = value.toString().split(",");
      List<Object> struct = new ArrayList<Object>(2);
      struct.add(0, Integer.parseInt(items[0]));
      struct.add(1, items[1]);
      row = serde.serialize(struct, oip);
      context.write(null, row);
    }
  }

  @Test // Test regular outputformat
  public void testNewOutputFormat() throws Exception {
    int rownum = 1000;

    Path inputPath = new Path(workDir,
        "TestOrcFile." + testCaseName.getMethodName() + ".txt");
    Path outputPath = new Path(workDir,
        "TestOrcFile." + testCaseName.getMethodName() + ".orc");
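    // Clear any stale output, then seed the job with rownum random
    // "int,uuid" CSV lines, remembering the first row so it can be checked
    // against the ORC output below.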
    localFs.delete(outputPath, true);
    PrintWriter pw = new PrintWriter(
        new OutputStreamWriter(localFs.create(inputPath)));
    Random r = new Random(1000L);
    boolean firstRow = true;
    int firstIntValue = 0;
    String firstStringValue = null;
    for (int i = 0; i < rownum; i++) {
      int intValue = r.nextInt();
      String stringValue = UUID.randomUUID().toString();
      if (firstRow) {
        firstRow = false;
        firstIntValue = intValue;
        firstStringValue = stringValue;
      }
      pw.println(intValue + "," + stringValue);
    }
    pw.close();

    Job job = new Job(conf, "orc test");
    job.setOutputFormatClass(OrcNewOutputFormat.class);
    job.setJarByClass(TestNewInputOutputFormat.class);
    job.setMapperClass(OrcTestMapper2.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Writable.class);
    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);
    boolean result = job.waitForCompletion(true);
    assertTrue(result);

    Path outputFilePath = new Path(outputPath, "part-m-00000");
    assertTrue(localFs.exists(outputFilePath));
    Reader reader = OrcFile.createReader(outputFilePath,
        OrcFile.readerOptions(conf).filesystem(localFs));
    assertTrue(reader.getNumberOfRows() == rownum);
    assertEquals(reader.getCompression(), CompressionKind.ZLIB);
    StructObjectInspector soi =
        (StructObjectInspector) reader.getObjectInspector();
    StructTypeInfo ti =
        (StructTypeInfo) TypeInfoUtils.getTypeInfoFromObjectInspector(soi);
    assertEquals(((PrimitiveTypeInfo) ti.getAllStructFieldTypeInfos().get(0))
        .getPrimitiveCategory(),
        PrimitiveObjectInspector.PrimitiveCategory.INT);
    assertEquals(((PrimitiveTypeInfo) ti.getAllStructFieldTypeInfos().get(1))
        .getPrimitiveCategory(),
        PrimitiveObjectInspector.PrimitiveCategory.STRING);

    RecordReader rows = reader.rows();
    Object row = rows.next(null);
    IntWritable intWritable = (IntWritable) soi.getStructFieldData(row,
        soi.getAllStructFieldRefs().get(0));
    Text text = (Text) soi.getStructFieldData(row,
        soi.getAllStructFieldRefs().get(1));
    assertEquals(intWritable.get(), firstIntValue);
    assertEquals(text.toString(), firstStringValue);
    localFs.delete(outputPath, true);
  }

  @Test // Test outputformat with compression
  public void testNewOutputFormatWithCompression() throws Exception {
    conf.set("hive.exec.orc.default.compress", "SNAPPY");

    Path inputPath = new Path(workDir,
        "TestOrcFile." + testCaseName.getMethodName() + ".txt");
    Path outputPath = new Path(workDir,
        "TestOrcFile." + testCaseName.getMethodName() + ".orc");
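    // Two rows are enough here; what matters is that the SNAPPY default set
    // on the conf above makes it through to the ORC writer.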
    localFs.delete(outputPath, true);
    PrintWriter pw = new PrintWriter(
        new OutputStreamWriter(localFs.create(inputPath)));
    pw.println("1,hello");
    pw.println("2,world");
    pw.close();

    Job job = new Job(conf, "orc test");
    job.setOutputFormatClass(OrcNewOutputFormat.class);
    job.setJarByClass(TestNewInputOutputFormat.class);
    job.setMapperClass(OrcTestMapper2.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(OrcSerdeRow.class);
    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);
    boolean result = job.waitForCompletion(true);
    assertTrue(result);

    Path outputFilePath = new Path(outputPath, "part-m-00000");
    Reader reader = OrcFile.createReader(outputFilePath,
        OrcFile.readerOptions(conf).filesystem(localFs));
    assertEquals(reader.getCompression(), CompressionKind.SNAPPY);
    localFs.delete(outputPath, true);
  }

  public static class OrcTestMapper3 extends
      Mapper<Object, Text, IntWritable, Text> {
    @Override
    public void map(Object key, Text value, Context context)
        throws IOException, InterruptedException {
      // Key each line by its number of words.
      String[] items = value.toString().split("\\s+");
      context.write(new IntWritable(items.length), value);
    }
  }

  public static class OrcTestReducer3 extends
      Reducer<IntWritable, Text, NullWritable, Writable> {
    final static TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(
        "struct<length:int,count:int,list:array"
        + "<struct<lastword:string,lastwordlength:int>>,"
        + "wordcounts:map<string,int>>");
    private final ObjectInspector oip = TypeInfoUtils
        .getStandardJavaObjectInspectorFromTypeInfo(typeInfo);
    private final OrcSerde serde = new OrcSerde();
    private Writable row;

    @Override
    public void reduce(IntWritable key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {
      // Collect the last word of every line in the group and count each
      // word's occurrences across the group.
      List<String> lastwords = new ArrayList<String>();
      Map<String, Integer> wordCounts = new HashMap<String, Integer>();
      int count = 0;
      for (Text val : values) {
        String[] items = val.toString().toLowerCase().split("\\s+");
        lastwords.add(items[items.length - 1]);
        for (String item : items) {
          if (wordCounts.containsKey(item)) {
            wordCounts.put(item, wordCounts.get(item) + 1);
          } else {
            wordCounts.put(item, 1);
          }
        }
        count++;
      }
      List<Object> struct = new ArrayList<Object>(4);
      struct.add(0, key.get());
      struct.add(1, count);
      List<List<Object>> lastWordInfoList = new ArrayList<List<Object>>();
      Collections.sort(lastwords);
      for (String word : lastwords) {
        List<Object> info = new ArrayList<Object>(2);
        info.add(0, word);
        info.add(1, word.length());
        lastWordInfoList.add(info);
      }
      struct.add(2, lastWordInfoList);
      struct.add(3, wordCounts);
      row = serde.serialize(struct, oip);
      context.write(NullWritable.get(), row);
    }
  }

  @SuppressWarnings("unchecked")
  @Test // Test outputformat with complex data type, and with reduce
  public void testNewOutputFormatComplex() throws Exception {
    Path inputPath = new Path(workDir,
        "TestOrcFile." + testCaseName.getMethodName() + ".txt");
    Path outputPath = new Path(workDir,
        "TestOrcFile." + testCaseName.getMethodName() + ".orc");
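    // The input is a short poem; the mapper keys each line by its word
    // count and the reducer folds each group into the nested struct type
    // declared in OrcTestReducer3.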
    localFs.delete(outputPath, true);
    PrintWriter pw = new PrintWriter(
        new OutputStreamWriter(localFs.create(inputPath)));
    pw.println("I have eaten");
    pw.println("the plums");
    pw.println("that were in");
    pw.println("the icebox");
    pw.println("and which");
    pw.println("you were probably");
    pw.println("saving");
    pw.println("for breakfast");
    pw.println("Forgive me");
    pw.println("they were delicious");
    pw.println("so sweet");
    pw.println("and so cold");
    pw.close();

    Job job = new Job(conf, "orc test");
    job.setOutputFormatClass(OrcNewOutputFormat.class);
    job.setJarByClass(TestNewInputOutputFormat.class);
    job.setMapperClass(OrcTestMapper3.class);
    job.setReducerClass(OrcTestReducer3.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(OrcSerdeRow.class);
    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);
    boolean result = job.waitForCompletion(true);
    assertTrue(result);

    Path outputFilePath = new Path(outputPath, "part-r-00000");
    Reader reader = OrcFile.createReader(outputFilePath,
        OrcFile.readerOptions(conf).filesystem(localFs));
    RecordReader rows = reader.rows();
    ObjectInspector orcOi = reader.getObjectInspector();
    ObjectInspector stoi = TypeInfoUtils
        .getStandardJavaObjectInspectorFromTypeInfo(OrcTestReducer3.typeInfo);
    ObjectInspectorConverters.Converter converter = ObjectInspectorConverters
        .getConverter(orcOi, stoi);

    Object row = rows.next(null);
    List<Object> converted = (List<Object>) converter.convert(row);
    assertEquals(1, converted.get(0));
    assertEquals(1, converted.get(1));
    List<Object> list = (List<Object>) converted.get(2);
    assertEquals(list.size(), 1);
    assertEquals("saving", ((List<Object>) list.get(0)).get(0));
    assertEquals(6, ((List<Object>) list.get(0)).get(1));
    Map<String, Integer> map = (Map<String, Integer>) converted.get(3);
    assertEquals(map.size(), 1);
    assertEquals(map.get("saving"), new Integer(1));

    row = rows.next(null);
    converted = (List<Object>) converter.convert(row);
    assertEquals(2, converted.get(0));
    assertEquals(6, converted.get(1));
    list = (List<Object>) converted.get(2);
    assertEquals(list.size(), 6);
    assertEquals("breakfast", ((List<Object>) list.get(0)).get(0));
    assertEquals(9, ((List<Object>) list.get(0)).get(1));
    map = (Map<String, Integer>) converted.get(3);
    assertEquals(map.size(), 11);
    assertEquals(map.get("the"), new Integer(2));

    row = rows.next(null);
    converted = (List<Object>) converter.convert(row);
    assertEquals(3, converted.get(0));
    assertEquals(5, converted.get(1));
    list = (List<Object>) converted.get(2);
    assertEquals(list.size(), 5);
    assertEquals("cold", ((List<Object>) list.get(0)).get(0));
    assertEquals(4, ((List<Object>) list.get(0)).get(1));
    map = (Map<String, Integer>) converted.get(3);
    assertEquals(map.size(), 13);
    assertEquals(map.get("were"), new Integer(3));

    assertFalse(rows.hasNext());
    localFs.delete(outputPath, true);
  }

  @Test // Test inputformat with column prune
  public void testNewInputFormatPruning() throws Exception {
    conf.set("hive.io.file.read.all.columns", "false");
    conf.set("hive.io.file.readcolumn.ids", "1,3");
    Job job = new Job(conf, "orc test");
    job.setInputFormatClass(OrcNewInputFormat.class);
    job.setJarByClass(TestNewInputOutputFormat.class);
    job.setMapperClass(OrcTestMapper1.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(HiveTestUtils
        .getFileFromClasspath("orc-file-11-format.orc")));
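    // Only columns 1 and 3 were requested above, so every other field of
    // the first row should be read back as null.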
    Path outputPath = new Path(workDir,
        "TestOrcFile." + testCaseName.getMethodName() + ".txt");
    localFs.delete(outputPath, true);
    FileOutputFormat.setOutputPath(job, outputPath);
    boolean result = job.waitForCompletion(true);
    assertTrue(result);

    Path outputFilePath = new Path(outputPath, "part-m-00000");
    BufferedReader reader = new BufferedReader(
        new InputStreamReader(localFs.open(outputFilePath)));
    String line = reader.readLine();
    assertEquals(line, "{null, 1, null, 65536, null, null, null, "
        + "null, null, null, null, null, null, null}");
    localFs.delete(outputPath, true);
  }
}