/**
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hive.benchmark.storage;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx;
import org.apache.hadoop.hive.ql.io.HiveOutputFormat;
import org.apache.hadoop.hive.ql.io.IOConstants;
import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcSerde;
import org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat;
import org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat;
import org.apache.hadoop.hive.ql.io.parquet.read.DataWritableReadSupport;
import org.apache.hadoop.hive.ql.io.parquet.serde.ArrayWritableObjectInspector;
import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe;
import org.apache.hadoop.hive.ql.io.parquet.vector.VectorizedParquetRecordReader;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.apache.hadoop.hive.serde2.io.DoubleWritable;
import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.parquet.hadoop.ParquetInputFormat;
import org.apache.parquet.hadoop.ParquetInputSplit;
import org.apache.parquet.hadoop.api.ReadSupport;
import org.apache.parquet.hadoop.example.GroupReadSupport;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.TearDown;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.runner.Runner;
import org.openjdk.jmh.runner.options.Options;
import org.openjdk.jmh.runner.options.OptionsBuilder;

import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.util.Properties;
import java.util.List;
import java.util.Random;
import java.util.ArrayList;
import java.util.Arrays;

@State(Scope.Benchmark)
public class ColumnarStorageBench {
  /**
   * This test compares the performance of the different columnar storage formats used
   * by Hive. If you need to add more formats, see the 'format' global variable to add
   * a new one to the list, and create a class that implements the StorageFormatTest interface.
   *
   * This test uses the JMH framework for benchmarking.
   * You may execute this benchmark tool using the JMH command line in different ways:
   *
   * To use the settings shown in the main() function, use:
   * $ java -cp target/benchmarks.jar org.apache.hive.benchmark.storage.ColumnarStorageBench
   *
   * To use the default settings used by JMH, use:
   * $ java -jar target/benchmarks.jar org.apache.hive.benchmark.storage ColumnarStorageBench
   *
   * To specify different parameters, use:
   * - This command will use 10 warm-up iterations, 5 test iterations, and 2 forks. And it will
   *   display the Average Time (avgt) in Microseconds (us)
   * - Benchmark mode. Available modes are:
   *   [Throughput/thrpt, AverageTime/avgt, SampleTime/sample, SingleShotTime/ss, All/all]
   * - Output time unit. Available time units are: [m, s, ms, us, ns].
   *
   * $ java -jar target/benchmarks.jar org.apache.hive.benchmark.storage ColumnarStorageBench -wi 10 -i 5 -f 2 -bm avgt -tu us
   */
  private static final String DEFAULT_TEMP_LOCATION = "/tmp";

  private File writeFile, readFile, recordWriterFile;
  private Path writePath, readPath, recordWriterPath;
  private FileSystem fs;

  /**
   * LazySimpleSerDe is used to create our testing rows.
   */
  private LazySimpleSerDe lazySimpleSerDe;

  /**
   * Contains the implementation for the storage format under test.
   */
  private StorageFormatTest storageFormatTest;
  private RecordWriter recordWriter;
  private RecordReader recordReader;

  /**
   * These objects contain the records to be tested.
   */
  private Writable recordWritable[];
  private Object rows[];

  private ObjectInspector oi;

  /**
   * These column types are used for the record that will be tested.
   */
  private Properties recordProperties;
  private String DEFAULT_COLUMN_TYPES = "int,double,boolean,string,array<int>,map<string,string>,struct<a:int,b:int>";

  public ColumnarStorageBench() {
    recordProperties = new Properties();
    recordProperties.setProperty("columns", getColumnNames(DEFAULT_COLUMN_TYPES));
    recordProperties.setProperty("columns.types", DEFAULT_COLUMN_TYPES);

    final int NUMBER_OF_ROWS_TO_TEST = 100;
    rows = new Object[NUMBER_OF_ROWS_TO_TEST];
    recordWritable = new Writable[NUMBER_OF_ROWS_TO_TEST];

    /**
     * We use LazySimpleSerDe to generate our testing rows.
     */
    try {
      lazySimpleSerDe = new LazySimpleSerDe();
      SerDeUtils.initializeSerDe(lazySimpleSerDe, new Configuration(), recordProperties, null);

      oi = lazySimpleSerDe.getObjectInspector();

      for (int i = 0; i < NUMBER_OF_ROWS_TO_TEST; i++) {
        rows[i] = createRandomRow(DEFAULT_COLUMN_TYPES);
      }
    } catch (SerDeException e) {
      e.printStackTrace();
    }
  }

  private String getColumnNames(final String columnTypes) {
    StringBuilder columnNames = new StringBuilder();

    /* Construct a string of column names based on the number of column types */
    List<TypeInfo> columnTypesList = TypeInfoUtils.getTypeInfosFromTypeString(columnTypes);
    for (int i = 0; i < columnTypesList.size(); i++) {
      if (i > 0) {
        columnNames.append(",");
      }
      columnNames.append("c" + i);
    }

    return columnNames.toString();
  }

  private long fileLength(Path path) throws IOException {
    return fs.getFileStatus(path).getLen();
  }

  private ArrayWritable record(Writable... fields) {
    return new ArrayWritable(Writable.class, fields);
  }

  private Writable getPrimitiveWritable(final PrimitiveTypeInfo typeInfo) {
    Random rand = new Random();

    switch (typeInfo.getPrimitiveCategory()) {
      case INT:
        return new IntWritable(rand.nextInt());
      case DOUBLE:
        return new DoubleWritable(rand.nextDouble());
      case BOOLEAN:
        return new BooleanWritable(rand.nextBoolean());
      case CHAR:
      case VARCHAR:
      case STRING:
        byte b[] = new byte[30];
        rand.nextBytes(b);
        return new BytesWritable(b);
      default:
        throw new IllegalArgumentException("Invalid primitive type: " + typeInfo.getTypeName());
    }
  }

  private ArrayWritable createRecord(final List<TypeInfo> columnTypes) {
    Writable[] fields = new Writable[columnTypes.size()];

    int pos = 0;
    for (TypeInfo type : columnTypes) {
      switch (type.getCategory()) {
        case PRIMITIVE:
          fields[pos++] = getPrimitiveWritable((PrimitiveTypeInfo) type);
          break;
        case LIST: {
          List<TypeInfo> elementType = new ArrayList<TypeInfo>();
          elementType.add(((ListTypeInfo) type).getListElementTypeInfo());

          fields[pos++] = createRecord(elementType);
        } break;
        case MAP: {
          List<TypeInfo> keyValueType = new ArrayList<TypeInfo>();
          keyValueType.add(((MapTypeInfo) type).getMapKeyTypeInfo());
          keyValueType.add(((MapTypeInfo) type).getMapValueTypeInfo());

          fields[pos++] = record(createRecord(keyValueType));
        } break;
        case STRUCT: {
          List<TypeInfo> elementType = ((StructTypeInfo) type).getAllStructFieldTypeInfos();

          fields[pos++] = createRecord(elementType);
        } break;
        default:
          throw new IllegalStateException("Invalid column type: " + type);
      }
    }

    return record(fields);
  }

  private ObjectInspector getArrayWritableObjectInspector(final String columnTypes) {
    List<TypeInfo> columnTypeList = TypeInfoUtils.getTypeInfosFromTypeString(columnTypes);
    List<String> columnNameList = Arrays.asList(getColumnNames(columnTypes).split(","));
    StructTypeInfo rowTypeInfo = (StructTypeInfo) TypeInfoFactory.getStructTypeInfo(columnNameList, columnTypeList);

    return new ArrayWritableObjectInspector(rowTypeInfo);
  }

  private Object createRandomRow(final String columnTypes) throws SerDeException {
    Writable recordWritable = createRecord(TypeInfoUtils.getTypeInfosFromTypeString(columnTypes));
    Writable simpleWritable = lazySimpleSerDe.serialize(recordWritable, getArrayWritableObjectInspector(columnTypes));
    return lazySimpleSerDe.deserialize(simpleWritable);
  }

  /**
   * This class encapsulates all methods that will be called by each of the @Benchmark
   * methods.
   */
  private class StorageFormatTest {
    private AbstractSerDe serDe;
    private JobConf jobConf;
    private HiveOutputFormat outputFormat;
    private InputFormat inputFormat;

    public StorageFormatTest(AbstractSerDe serDeImpl, HiveOutputFormat outputFormatImpl, InputFormat inputFormatImpl) throws SerDeException {
      jobConf = new JobConf();
      serDe = serDeImpl;
      outputFormat = outputFormatImpl;
      inputFormat = inputFormatImpl;

      Configuration conf = new Configuration();
      SerDeUtils.initializeSerDe(serDe, conf, recordProperties, null);
    }

    public Writable serialize(Object row, ObjectInspector oi) throws SerDeException {
      return serDe.serialize(row, oi);
    }

    public Object deserialize(Writable record) throws SerDeException {
      return serDe.deserialize(record);
    }

    /* We write many records because sometimes the RecordWriter for the format under test
     * behaves differently with one record than with a bunch of records. */
    public void writeRecords(RecordWriter writer, Writable records[]) throws IOException {
      for (int i = 0; i < records.length; i++) {
        writer.write(records[i]);
      }
    }

    /* We read many records because sometimes the RecordReader for the format under test
     * behaves differently with one record than with a bunch of records. */
    public Object readRecords(RecordReader reader) throws IOException {
      Object alwaysNull = reader.createKey();
      Object record = reader.createValue();

      // Just loop through all values. We do not need to store anything though.
      // This is just for test purposes.
      while (reader.next(alwaysNull, record)) ;

      return record;
    }

    public RecordWriter getRecordWriter(Path outputPath) throws IOException {
      return outputFormat.getHiveRecordWriter(jobConf, outputPath, null, false, recordProperties, null);
    }

    public RecordReader getRecordReader(Path inputPath) throws Exception {
      if ("parquet".equals(format) || "orc".equals(format)) {
        return inputFormat.getRecordReader(
            new FileSplit(inputPath, 0, fileLength(inputPath), (String[]) null), jobConf, null);
      } else if ("parquet-vec".equals(format)) {
        return getVectorizedRecordReader(inputPath);
      } else {
        throw new IllegalArgumentException("Invalid file format argument: " + format);
      }
    }

    public RecordReader getVectorizedRecordReader(Path inputPath) throws Exception {
      Configuration conf = new Configuration();
      conf.set(IOConstants.COLUMNS, getColumnNames(DEFAULT_COLUMN_TYPES));
      conf.set(IOConstants.COLUMNS_TYPES, DEFAULT_COLUMN_TYPES);
      // TODO: VectorizedParquetRecordReader doesn't support map or array types yet; the value of
      // ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR should be updated once these data types
      // are supported.
      conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
      conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0,1,2,3,6");
      conf.set(ReadSupport.PARQUET_READ_SCHEMA, "test schema");
      HiveConf.setBoolVar(conf, HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, true);
      HiveConf.setVar(conf, HiveConf.ConfVars.PLAN, "//tmp");

      Job vectorJob = new Job(conf, "read vector");
      ParquetInputFormat.setInputPaths(vectorJob, inputPath);
      ParquetInputFormat parquetInputFormat = new ParquetInputFormat(GroupReadSupport.class);
      ParquetInputSplit split = (ParquetInputSplit) parquetInputFormat.getSplits(vectorJob).get(0);
      initialVectorizedRowBatchCtx(conf);
      return new VectorizedParquetRecordReader(split, new JobConf(conf));
    }
  }

  /**
   * This class is called to run the Parquet I/O tests.
   */
  private class ParquetStorageFormatTest extends StorageFormatTest {
    public ParquetStorageFormatTest() throws SerDeException {
      super(new ParquetHiveSerDe(), new MapredParquetOutputFormat(), new MapredParquetInputFormat());
    }
  }

  /**
   * This class is called to run the ORC I/O tests.
   */
  private class OrcStorageFormatTest extends StorageFormatTest {
    public OrcStorageFormatTest() throws SerDeException {
      super(new OrcSerde(), new OrcOutputFormat(), new OrcInputFormat());
    }
  }

  private File createTempFile() throws IOException {
    if (URI.create(DEFAULT_TEMP_LOCATION).getScheme() != null) {
      throw new IOException("Cannot create temporary files in a non-local file-system: Operation not permitted.");
    }

    File temp = File.createTempFile(this.toString(), null, new File(DEFAULT_TEMP_LOCATION));
    temp.deleteOnExit();
    temp.delete();

    return temp;
  }

  // Test different format types
  @Param({"orc", "parquet", "parquet-vec"})
  public String format;

  /**
   * Initializes resources that will be needed for each of the benchmark tests.
   *
   * @throws SerDeException If it cannot initialize the desired test format.
   * @throws IOException If it cannot write data to temporary files.
   */
  @Setup(Level.Trial)
  public void prepareBenchmark() throws SerDeException, IOException {
    if (format.equalsIgnoreCase("parquet") || format.equalsIgnoreCase("parquet-vec")) {
      storageFormatTest = new ParquetStorageFormatTest();
    } else if (format.equalsIgnoreCase("orc")) {
      storageFormatTest = new OrcStorageFormatTest();
    } else {
      throw new IllegalArgumentException("Invalid file format argument: " + format);
    }

    for (int i = 0; i < rows.length; i++) {
      recordWritable[i] = storageFormatTest.serialize(rows[i], oi);
    }

    fs = FileSystem.getLocal(new Configuration());

    writeFile = createTempFile();
    writePath = new Path(writeFile.getPath());

    readFile = createTempFile();
    readPath = new Path(readFile.getPath());

    /*
     * Write a bunch of random rows that will be used for the read benchmark.
     */
    RecordWriter writer = storageFormatTest.getRecordWriter(readPath);
    storageFormatTest.writeRecords(writer, recordWritable);
    writer.close(false);
  }

  private void initialVectorizedRowBatchCtx(Configuration conf) throws HiveException {
    MapWork mapWork = new MapWork();
    VectorizedRowBatchCtx rbCtx = new VectorizedRowBatchCtx();
    rbCtx.init(createStructObjectInspector(conf), new String[0]);
    mapWork.setVectorMode(true);
    mapWork.setVectorizedRowBatchCtx(rbCtx);
    Utilities.setMapWork(conf, mapWork);
  }

  private StructObjectInspector createStructObjectInspector(Configuration conf) {
    // Create row related objects
    String columnNames = conf.get(IOConstants.COLUMNS);
    List<String> columnNamesList = DataWritableReadSupport.getColumnNames(columnNames);
    String columnTypes = conf.get(IOConstants.COLUMNS_TYPES);
    List<TypeInfo> columnTypesList = DataWritableReadSupport.getColumnTypes(columnTypes);

    TypeInfo rowTypeInfo = TypeInfoFactory.getStructTypeInfo(columnNamesList, columnTypesList);
    return new ArrayWritableObjectInspector((StructTypeInfo) rowTypeInfo);
  }

  /**
   * Deletes any temporary files created by prepareBenchmark.
   */
  @TearDown(Level.Trial)
  public void cleanUpBenchmark() {
    readFile.delete();
  }

  /**
   * This method is invoked before every call to the methods to test. It creates
   * resources that are needed for each call (not at the benchmark level).
   *
   * @throws IOException If it cannot write temporary files.
   */
  @Setup(Level.Invocation)
  public void prepareInvocation() throws Exception {
    recordWriterFile = createTempFile();
    recordWriterPath = new Path(recordWriterFile.getPath());

    recordWriter = storageFormatTest.getRecordWriter(writePath);
    recordReader = storageFormatTest.getRecordReader(readPath);
  }

  /**
   * This method is invoked after every call to the methods to test. It closes
   * and cleans up all temporary files.
   *
   * @throws IOException If it cannot close or delete temporary files.
   */
  @TearDown(Level.Invocation)
  public void cleanUpInvocation() throws IOException {
    recordWriter.close(false);
    recordReader.close();

    recordWriterFile.delete();
    writeFile.delete();
  }

  @Benchmark
  public void write() throws IOException {
    storageFormatTest.writeRecords(recordWriter, recordWritable);
  }

  @Benchmark
  public Object read() throws IOException {
    return storageFormatTest.readRecords(recordReader);
  }

  @Benchmark
  public Writable serialize() throws SerDeException {
    return storageFormatTest.serialize(rows[0], oi);
  }

  @Benchmark
  public Object deserialize() throws SerDeException {
    return storageFormatTest.deserialize(recordWritable[0]);
  }

  @Benchmark
  public RecordWriter getRecordWriter() throws IOException {
    return storageFormatTest.getRecordWriter(recordWriterPath);
  }

  @Benchmark
  public RecordReader getRecordReader() throws Exception {
    return storageFormatTest.getRecordReader(readPath);
  }

  public static void main(String args[]) throws Exception {
    Options opt = new OptionsBuilder()
        .include(ColumnarStorageBench.class.getSimpleName())
        .warmupIterations(1)
        .measurementIterations(1)
        .forks(1)
        .build();

    new Runner(opt).run();
  }
}