/**
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hive.benchmark.storage;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx;
import org.apache.hadoop.hive.ql.io.HiveOutputFormat;
import org.apache.hadoop.hive.ql.io.IOConstants;
import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcSerde;
import org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat;
import org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat;
import org.apache.hadoop.hive.ql.io.parquet.read.DataWritableReadSupport;
import org.apache.hadoop.hive.ql.io.parquet.serde.ArrayWritableObjectInspector;
import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe;
import org.apache.hadoop.hive.ql.io.parquet.vector.VectorizedParquetRecordReader;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.apache.hadoop.hive.serde2.io.DoubleWritable;
import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.parquet.hadoop.ParquetInputFormat;
import org.apache.parquet.hadoop.ParquetInputSplit;
import org.apache.parquet.hadoop.api.ReadSupport;
import org.apache.parquet.hadoop.example.GroupReadSupport;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.TearDown;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.runner.Runner;
import org.openjdk.jmh.runner.options.Options;
import org.openjdk.jmh.runner.options.OptionsBuilder;

import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.util.Properties;
import java.util.List;
import java.util.Random;
import java.util.ArrayList;
import java.util.Arrays;

@State(Scope.Benchmark)
public class ColumnarStorageBench {
  /**
   * This test compares the performance of the different columnar storage formats used
   * by Hive. If you need to add more formats, see the 'format' global variable to add
   * a new one to the list, and create a class that implements the StorageFormatTest interface.
   *
   * This test uses the JMH framework for benchmarking.
   * You may execute this benchmark tool using the JMH command line in different ways:
   *
   * To use the settings shown in the main() function, use:
   * $ java -cp target/benchmarks.jar org.apache.hive.benchmark.storage.ColumnarStorageBench
   *
   * To use the default settings used by JMH, use:
   * $ java -jar target/benchmarks.jar org.apache.hive.benchmark.storage ColumnarStorageBench
   *
   * To specify different parameters, use:
   * - This command will use 10 warm-up iterations, 5 test iterations, and 2 forks. And it will
   *   display the Average Time (avgt) in Microseconds (us)
   * - Benchmark mode. Available modes are:
   *   [Throughput/thrpt, AverageTime/avgt, SampleTime/sample, SingleShotTime/ss, All/all]
   * - Output time unit. Available time units are: [m, s, ms, us, ns].
   *
   * $ java -jar target/benchmarks.jar org.apache.hive.benchmark.storage ColumnarStorageBench -wi 10 -i 5 -f 2 -bm avgt -tu us
   */
  private static final String DEFAULT_TEMP_LOCATION = "/tmp";

  private File writeFile, readFile, recordWriterFile;
  private Path writePath, readPath, recordWriterPath;
  private FileSystem fs;

  /**
   * LazySimpleSerDe is used to create our testing rows.
   */
  private LazySimpleSerDe lazySimpleSerDe;

  /**
   * Contains the implementation for the storage format under test.
   */
  private StorageFormatTest storageFormatTest;
  private RecordWriter recordWriter;
  private RecordReader recordReader;

  /**
   * These objects contain the records to be tested.
   */
  private Writable recordWritable[];
  private Object rows[];

  private ObjectInspector oi;

  /**
   * These column types are used for the record that will be tested.
   */
  private Properties recordProperties;
  private String DEFAULT_COLUMN_TYPES = "int,double,boolean,string,array<int>,map<string,string>,struct<a:int,b:int>";

  public ColumnarStorageBench() {
    recordProperties = new Properties();
    recordProperties.setProperty("columns", getColumnNames(DEFAULT_COLUMN_TYPES));
    recordProperties.setProperty("columns.types", DEFAULT_COLUMN_TYPES);

    final int NUMBER_OF_ROWS_TO_TEST = 100;
    rows = new Object[NUMBER_OF_ROWS_TO_TEST];
    recordWritable = new Writable[NUMBER_OF_ROWS_TO_TEST];

    /**
     * We use LazySimpleSerDe to generate our testing rows.
     */
    try {
      lazySimpleSerDe = new LazySimpleSerDe();
      SerDeUtils.initializeSerDe(lazySimpleSerDe, new Configuration(), recordProperties, null);

      oi = lazySimpleSerDe.getObjectInspector();

      for (int i = 0; i < NUMBER_OF_ROWS_TO_TEST; i++) {
        rows[i] = createRandomRow(DEFAULT_COLUMN_TYPES);
      }
    } catch (SerDeException e) {
      e.printStackTrace();
    }
  }

  private String getColumnNames(final String columnTypes) {
    StringBuilder columnNames = new StringBuilder();

    /* Construct a string of column names based on the number of column types */
    List<TypeInfo> columnTypesList = TypeInfoUtils.getTypeInfosFromTypeString(columnTypes);
    for (int i = 0; i < columnTypesList.size(); i++) {
      if (i > 0) {
        columnNames.append(",");
      }
      columnNames.append("c" + i);
    }

    return columnNames.toString();
  }

  private long fileLength(Path path) throws IOException {
    return fs.getFileStatus(path).getLen();
  }

  private ArrayWritable record(Writable... fields) {
    return new ArrayWritable(Writable.class, fields);
  }

  private Writable getPrimitiveWritable(final PrimitiveTypeInfo typeInfo) {
    Random rand = new Random();

    switch (typeInfo.getPrimitiveCategory()) {
      case INT:
        return new IntWritable(rand.nextInt());
      case DOUBLE:
        return new DoubleWritable(rand.nextDouble());
      case BOOLEAN:
        return new BooleanWritable(rand.nextBoolean());
      case CHAR:
      case VARCHAR:
      case STRING:
        byte b[] = new byte[30];
        rand.nextBytes(b);
        return new BytesWritable(b);
      default:
        throw new IllegalArgumentException("Invalid primitive type: " + typeInfo.getTypeName());
    }
  }

  private ArrayWritable createRecord(final List<TypeInfo> columnTypes) {
    Writable[] fields = new Writable[columnTypes.size()];

    int pos = 0;
    for (TypeInfo type : columnTypes) {
      switch (type.getCategory()) {
        case PRIMITIVE:
          fields[pos++] = getPrimitiveWritable((PrimitiveTypeInfo) type);
          break;
        case LIST: {
          List<TypeInfo> elementType = new ArrayList<TypeInfo>();
          elementType.add(((ListTypeInfo) type).getListElementTypeInfo());

          fields[pos++] = createRecord(elementType);
        } break;
        case MAP: {
          List<TypeInfo> keyValueType = new ArrayList<TypeInfo>();
          keyValueType.add(((MapTypeInfo) type).getMapKeyTypeInfo());
          keyValueType.add(((MapTypeInfo) type).getMapValueTypeInfo());

          fields[pos++] = record(createRecord(keyValueType));
        } break;
        case STRUCT: {
          List<TypeInfo> elementType = ((StructTypeInfo) type).getAllStructFieldTypeInfos();

          fields[pos++] = createRecord(elementType);
        } break;
        default:
          throw new IllegalStateException("Invalid column type: " + type);
      }
    }

    return record(fields);
  }

  private ObjectInspector getArrayWritableObjectInspector(final String columnTypes) {
    List<TypeInfo> columnTypeList = TypeInfoUtils.getTypeInfosFromTypeString(columnTypes);
    List<String> columnNameList = Arrays.asList(getColumnNames(columnTypes).split(","));
    StructTypeInfo rowTypeInfo = (StructTypeInfo) TypeInfoFactory.getStructTypeInfo(columnNameList, columnTypeList);

    return new ArrayWritableObjectInspector(rowTypeInfo);
  }

  private Object createRandomRow(final String columnTypes) throws SerDeException {
    Writable recordWritable = createRecord(TypeInfoUtils.getTypeInfosFromTypeString(columnTypes));
    Writable simpleWritable = lazySimpleSerDe.serialize(recordWritable, getArrayWritableObjectInspector(columnTypes));
    return lazySimpleSerDe.deserialize(simpleWritable);
  }

  /**
   * This class encapsulates all methods that will be called by each of the @Benchmark
   * methods.
   */
  private class StorageFormatTest {
    private AbstractSerDe serDe;
    private JobConf jobConf;
    private HiveOutputFormat outputFormat;
    private InputFormat inputFormat;

    public StorageFormatTest(AbstractSerDe serDeImpl, HiveOutputFormat outputFormatImpl, InputFormat inputFormatImpl) throws SerDeException {
      jobConf = new JobConf();
      serDe = serDeImpl;
      outputFormat = outputFormatImpl;
      inputFormat = inputFormatImpl;

      Configuration conf = new Configuration();
      SerDeUtils.initializeSerDe(serDe, conf, recordProperties, null);
    }

    public Writable serialize(Object row, ObjectInspector oi) throws SerDeException {
      return serDe.serialize(row, oi);
    }

    public Object deserialize(Writable record) throws SerDeException {
      return serDe.deserialize(record);
    }

    /* We write many records because sometimes the RecordWriter for the format under test
     * behaves differently with one record than with a bunch of records. */
    public void writeRecords(RecordWriter writer, Writable records[]) throws IOException {
      for (int i = 0; i < records.length; i++) {
        writer.write(records[i]);
      }
    }

    /* We read many records because sometimes the RecordReader for the format under test
     * behaves differently with one record than with a bunch of records. */
    public Object readRecords(RecordReader reader) throws IOException {
      Object alwaysNull = reader.createKey();
      Object record = reader.createValue();

      // Just loop through all values. We do not need to store anything though.
      // This is just for test purposes.
      while (reader.next(alwaysNull, record)) ;

      return record;
    }

    public RecordWriter getRecordWriter(Path outputPath) throws IOException {
      return outputFormat.getHiveRecordWriter(jobConf, outputPath, null, false, recordProperties, null);
    }

    public RecordReader getRecordReader(Path inputPath) throws Exception {
      if ("parquet".equals(format) || "orc".equals(format)) {
        return inputFormat.getRecordReader(
            new FileSplit(inputPath, 0, fileLength(inputPath), (String[]) null), jobConf, null);
      } else if ("parquet-vec".equals(format)) {
        return getVectorizedRecordReader(inputPath);
      } else {
        throw new IllegalArgumentException("Invalid file format argument: " + format);
      }
    }

    public RecordReader getVectorizedRecordReader(Path inputPath) throws Exception {
      Configuration conf = new Configuration();
      conf.set(IOConstants.COLUMNS, getColumnNames(DEFAULT_COLUMN_TYPES));
      conf.set(IOConstants.COLUMNS_TYPES, DEFAULT_COLUMN_TYPES);
      // TODO: VectorizedParquetRecordReader doesn't support map or array types yet; the value of
      // ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR should be updated once these data types
      // are supported.
      conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
      conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0,1,2,3,6");
      conf.set(ReadSupport.PARQUET_READ_SCHEMA, "test schema");
      HiveConf.setBoolVar(conf, HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, true);
      HiveConf.setVar(conf, HiveConf.ConfVars.PLAN, "//tmp");

      Job vectorJob = new Job(conf, "read vector");
      ParquetInputFormat.setInputPaths(vectorJob, inputPath);
      ParquetInputFormat parquetInputFormat = new ParquetInputFormat(GroupReadSupport.class);
      ParquetInputSplit split = (ParquetInputSplit) parquetInputFormat.getSplits(vectorJob).get(0);
      initialVectorizedRowBatchCtx(conf);
      return new VectorizedParquetRecordReader(split, new JobConf(conf));
    }
  }

  /**
   * This class is called to run the Parquet I/O tests.
   */
  private class ParquetStorageFormatTest extends StorageFormatTest {
    public ParquetStorageFormatTest() throws SerDeException {
      super(new ParquetHiveSerDe(), new MapredParquetOutputFormat(), new MapredParquetInputFormat());
    }
  }

  /**
   * This class is called to run the ORC I/O tests.
   */
  private class OrcStorageFormatTest extends StorageFormatTest {
    public OrcStorageFormatTest() throws SerDeException {
      super(new OrcSerde(), new OrcOutputFormat(), new OrcInputFormat());
    }
  }

  private File createTempFile() throws IOException {
    if (URI.create(DEFAULT_TEMP_LOCATION).getScheme() != null) {
      throw new IOException("Cannot create temporary files in a non-local file-system: Operation not permitted.");
    }

    File temp = File.createTempFile(this.toString(), null, new File(DEFAULT_TEMP_LOCATION));
    temp.deleteOnExit();
    temp.delete();

    return temp;
  }

  // Test different format types
  @Param({"orc", "parquet", "parquet-vec"})
  public String format;

  /**
   * Initializes resources that will be needed for each of the benchmark tests.
   *
   * @throws SerDeException If it cannot initialize the desired test format.
   * @throws IOException If it cannot write data to temporary files.
   */
  @Setup(Level.Trial)
  public void prepareBenchmark() throws SerDeException, IOException {
    if (format.equalsIgnoreCase("parquet") || format.equalsIgnoreCase("parquet-vec")) {
      storageFormatTest = new ParquetStorageFormatTest();
    } else if (format.equalsIgnoreCase("orc")) {
      storageFormatTest = new OrcStorageFormatTest();
    } else {
      throw new IllegalArgumentException("Invalid file format argument: " + format);
    }

    for (int i = 0; i < rows.length; i++) {
      recordWritable[i] = storageFormatTest.serialize(rows[i], oi);
    }

    fs = FileSystem.getLocal(new Configuration());

    writeFile = createTempFile();
    writePath = new Path(writeFile.getPath());

    readFile = createTempFile();
    readPath = new Path(readFile.getPath());

    /*
     * Write a bunch of random rows that will be used for the read benchmark.
     */
    RecordWriter writer = storageFormatTest.getRecordWriter(readPath);
    storageFormatTest.writeRecords(writer, recordWritable);
    writer.close(false);
  }

  private void initialVectorizedRowBatchCtx(Configuration conf) throws HiveException {
    MapWork mapWork = new MapWork();
    VectorizedRowBatchCtx rbCtx = new VectorizedRowBatchCtx();
    rbCtx.init(createStructObjectInspector(conf), new String[0]);
    mapWork.setVectorMode(true);
    mapWork.setVectorizedRowBatchCtx(rbCtx);
    Utilities.setMapWork(conf, mapWork);
  }

  private StructObjectInspector createStructObjectInspector(Configuration conf) {
    // Create row related objects
    String columnNames = conf.get(IOConstants.COLUMNS);
    List<String> columnNamesList = DataWritableReadSupport.getColumnNames(columnNames);
    String columnTypes = conf.get(IOConstants.COLUMNS_TYPES);
    List<TypeInfo> columnTypesList = DataWritableReadSupport.getColumnTypes(columnTypes);

    TypeInfo rowTypeInfo = TypeInfoFactory.getStructTypeInfo(columnNamesList, columnTypesList);
    return new ArrayWritableObjectInspector((StructTypeInfo) rowTypeInfo);
  }

  /**
   * Deletes any temporary files created by prepareBenchmark.
   */
  @TearDown(Level.Trial)
  public void cleanUpBenchmark() {
    readFile.delete();
  }

  /**
   * This method is invoked before every call to the methods to test. It creates
   * resources that are needed for each call (not at the benchmark level).
   *
   * @throws IOException If it cannot write temporary files.
   */
  @Setup(Level.Invocation)
  public void prepareInvocation() throws Exception {
    recordWriterFile = createTempFile();
    recordWriterPath = new Path(recordWriterFile.getPath());

    recordWriter = storageFormatTest.getRecordWriter(writePath);
    recordReader = storageFormatTest.getRecordReader(readPath);
  }

  /**
   * This method is invoked after every call to the methods to test. It closes
   * and cleans up all temporary files.
   *
   * @throws IOException If it cannot close or delete temporary files.
   */
  @TearDown(Level.Invocation)
  public void cleanUpInvocation() throws IOException {
    recordWriter.close(false);
    recordReader.close();

    recordWriterFile.delete();
    writeFile.delete();
  }

  @Benchmark
  public void write() throws IOException {
    storageFormatTest.writeRecords(recordWriter, recordWritable);
  }

  @Benchmark
  public Object read() throws IOException {
    return storageFormatTest.readRecords(recordReader);
  }

  @Benchmark
  public Writable serialize() throws SerDeException {
    return storageFormatTest.serialize(rows[0], oi);
  }

  @Benchmark
  public Object deserialize() throws SerDeException {
    return storageFormatTest.deserialize(recordWritable[0]);
  }

  @Benchmark
  public RecordWriter getRecordWriter() throws IOException {
    return storageFormatTest.getRecordWriter(recordWriterPath);
  }

  @Benchmark
  public RecordReader getRecordReader() throws Exception {
    return storageFormatTest.getRecordReader(readPath);
  }

  public static void main(String args[]) throws Exception {
    Options opt = new OptionsBuilder()
        .include(ColumnarStorageBench.class.getSimpleName())
        .warmupIterations(1)
        .measurementIterations(1)
        .forks(1)
        .build();

    new Runner(opt).run();
  }
}