/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.facebook.presto.hive.benchmark;

import com.facebook.presto.hadoop.HadoopNative;
import com.facebook.presto.hive.HdfsEnvironment;
import com.facebook.presto.hive.HiveClientConfig;
import com.facebook.presto.hive.HiveCompressionCodec;
import com.facebook.presto.hive.HiveSessionProperties;
import com.facebook.presto.spi.ConnectorPageSource;
import com.facebook.presto.spi.ConnectorSession;
import com.facebook.presto.spi.Page;
import com.facebook.presto.spi.PageBuilder;
import com.facebook.presto.spi.block.BlockBuilder;
import com.facebook.presto.spi.type.Type;
import com.facebook.presto.testing.TestingConnectorSession;
import com.facebook.presto.type.ArrayType;
import com.google.common.collect.ImmutableList;
import io.airlift.slice.Slices;
import io.airlift.tpch.OrderColumn;
import io.airlift.tpch.TpchColumn;
import io.airlift.tpch.TpchEntity;
import io.airlift.tpch.TpchTable;
import io.airlift.units.DataSize;
import it.unimi.dsi.fastutil.ints.IntArrays;
import org.openjdk.jmh.annotations.AuxCounters;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.TearDown;
import org.openjdk.jmh.annotations.Warmup;
import org.openjdk.jmh.results.RunResult;
import org.openjdk.jmh.runner.Runner;
import org.openjdk.jmh.runner.options.Options;
import org.openjdk.jmh.runner.options.OptionsBuilder;
import org.openjdk.jmh.util.Statistics;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Random;
import java.util.UUID;
import java.util.concurrent.TimeUnit;

import static com.facebook.presto.hive.HiveTestUtils.createTestHdfsEnvironment;
import static com.facebook.presto.hive.HiveTestUtils.mapType;
import static com.facebook.presto.spi.type.BigintType.BIGINT;
import static com.facebook.presto.spi.type.DateType.DATE;
import static com.facebook.presto.spi.type.DoubleType.DOUBLE;
import static com.facebook.presto.spi.type.IntegerType.INTEGER;
import static com.facebook.presto.spi.type.VarcharType.createUnboundedVarcharType;
import static io.airlift.testing.FileUtils.createTempDir;
import static io.airlift.testing.FileUtils.deleteRecursively;
import static io.airlift.tpch.TpchTable.LINE_ITEM;
import static io.airlift.tpch.TpchTable.ORDERS;
import static io.airlift.units.DataSize.Unit.MEGABYTE;
import static java.lang.String.format;
import static java.util.stream.Collectors.toList;

@State(Scope.Thread)
@OutputTimeUnit(TimeUnit.SECONDS)
@Measurement(iterations = 50)
@Warmup(iterations = 20)
@Fork(3)
@SuppressWarnings("UseOfSystemOutOrSystemErr")
public class HiveFileFormatBenchmark
{
    private static final long MIN_DATA_SIZE = new DataSize(50, MEGABYTE).toBytes();
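
    // The native Hadoop libraries (e.g. the compression codecs) must be available before any
    // readers or writers are created, so they are loaded once in the static initializer below.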
    static {
        HadoopNative.requireHadoopNative();
    }

    @SuppressWarnings("deprecation")
    private static final HiveClientConfig CONFIG = new HiveClientConfig()
            .setRcfileOptimizedReaderEnabled(true)
            .setParquetOptimizedReaderEnabled(true);

    private static final ConnectorSession SESSION = new TestingConnectorSession(new HiveSessionProperties(CONFIG)
            .getSessionProperties());

    private static final HdfsEnvironment HDFS_ENVIRONMENT = createTestHdfsEnvironment(CONFIG);

    @Param({
            "LINEITEM",
            "BIGINT_SEQUENTIAL",
            "BIGINT_RANDOM",
            "VARCHAR_SMALL",
            "VARCHAR_LARGE",
            "VARCHAR_DICTIONARY",
            "MAP_VARCHAR_DOUBLE",
            "LARGE_MAP_VARCHAR_DOUBLE",
            "MAP_INT_DOUBLE",
            "LARGE_MAP_INT_DOUBLE",
            "LARGE_ARRAY_VARCHAR",
    })
    private DataSet dataSet;

    @Param({
            "NONE",
            "SNAPPY",
            "GZIP",
    })
    private HiveCompressionCodec compression;

    @Param({
            "PRESTO_RCBINARY",
            "PRESTO_RCTEXT",
            "PRESTO_ORC",
            "PRESTO_DWRF",
            "PRESTO_PARQUET",
            "HIVE_RCBINARY",
            "HIVE_RCTEXT",
            "HIVE_ORC",
            "HIVE_DWRF",
            "HIVE_PARQUET",
    })
    private FileFormat fileFormat;

    private TestData data;
    private File dataFile;

    private final File targetDir = createTempDir("presto-benchmark");

    public HiveFileFormatBenchmark()
    {
    }

    public HiveFileFormatBenchmark(DataSet dataSet, HiveCompressionCodec compression, FileFormat fileFormat)
    {
        this.dataSet = dataSet;
        this.compression = compression;
        this.fileFormat = fileFormat;
    }

    @Setup
    public void setup()
            throws IOException
    {
        data = dataSet.createTestData(fileFormat);
        targetDir.mkdirs();
        dataFile = new File(targetDir, UUID.randomUUID().toString());
        writeData(dataFile);
    }

    @TearDown
    public void tearDown()
    {
        deleteRecursively(targetDir);
    }

    @SuppressWarnings("PublicField")
    @AuxCounters
    @State(Scope.Thread)
    public static class CompressionCounter
    {
        public long inputSize;
        public long outputSize;
    }

    @Benchmark
    public List<Page> read(CompressionCounter counter)
            throws IOException
    {
        if (!fileFormat.supports(data)) {
            throw new RuntimeException(fileFormat + " does not support data set " + dataSet);
        }
        List<Page> pages = new ArrayList<>(100);
        try (ConnectorPageSource pageSource = fileFormat.createFileFormatReader(
                SESSION,
                HDFS_ENVIRONMENT,
                dataFile,
                data.getColumnNames(),
                data.getColumnTypes())) {
            while (!pageSource.isFinished()) {
                Page page = pageSource.getNextPage();
                if (page != null) {
                    page.assureLoaded();
                    pages.add(page);
                }
            }
        }
        counter.inputSize += data.getSize();
        counter.outputSize += dataFile.length();
        return pages;
    }

    @Benchmark
    public File write(CompressionCounter counter)
            throws IOException
    {
        File targetFile = new File(targetDir, UUID.randomUUID().toString());
        writeData(targetFile);
        counter.inputSize += data.getSize();
        counter.outputSize += targetFile.length();
        return targetFile;
    }

    private void writeData(File targetFile)
            throws IOException
    {
        List<Page> inputPages = data.getPages();
        try (FormatWriter formatWriter = fileFormat.createFileFormatWriter(
                SESSION,
                targetFile,
                data.getColumnNames(),
                data.getColumnTypes(),
                compression)) {
            for (Page page : inputPages) {
                formatWriter.writePage(page);
            }
        }
    }

    public enum DataSet
    {
        LINEITEM {
            @Override
            public TestData createTestData(FileFormat format)
            {
                return createTpchDataSet(format, LINE_ITEM, LINE_ITEM.getColumns());
            }
        },
        BIGINT_SEQUENTIAL {
            @Override
            public TestData createTestData(FileFormat format)
            {
                return createTpchDataSet(format, ORDERS, OrderColumn.ORDER_KEY);
            }
        },
        BIGINT_RANDOM {
            @Override
            public TestData createTestData(FileFormat format)
            {
                return createTpchDataSet(format, ORDERS, OrderColumn.CUSTOMER_KEY);
            }
        },
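        // Note: VARCHAR_SMALL and VARCHAR_LARGE below both read the CLERK column;
        // only the data set name differs in this benchmark.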
        VARCHAR_SMALL {
            @Override
            public TestData createTestData(FileFormat format)
            {
                return createTpchDataSet(format, ORDERS, OrderColumn.CLERK);
            }
        },
        VARCHAR_LARGE {
            @Override
            public TestData createTestData(FileFormat format)
            {
                return createTpchDataSet(format, ORDERS, OrderColumn.CLERK);
            }
        },
        VARCHAR_DICTIONARY {
            @Override
            public TestData createTestData(FileFormat format)
            {
                return createTpchDataSet(format, ORDERS, OrderColumn.ORDER_PRIORITY);
            }
        },
        MAP_VARCHAR_DOUBLE {
            private static final int MIN_ENTRIES = 1;
            private static final int MAX_ENTRIES = 5;

            @Override
            public TestData createTestData(FileFormat format)
            {
                Type type = mapType(createUnboundedVarcharType(), DOUBLE);
                Random random = new Random(1234);

                PageBuilder pageBuilder = new PageBuilder(ImmutableList.of(type));
                ImmutableList.Builder<Page> pages = ImmutableList.builder();
                int[] keys = new int[] {1, 2, 3, 4, 5};

                long dataSize = 0;
                while (dataSize < MIN_DATA_SIZE) {
                    pageBuilder.declarePosition();

                    BlockBuilder builder = pageBuilder.getBlockBuilder(0);
                    BlockBuilder mapBuilder = builder.beginBlockEntry();
                    int entries = nextRandomBetween(random, MIN_ENTRIES, MAX_ENTRIES);
                    IntArrays.shuffle(keys, random);
                    for (int entryId = 0; entryId < entries; entryId++) {
                        createUnboundedVarcharType().writeSlice(mapBuilder, Slices.utf8Slice("key" + keys[entryId]));
                        DOUBLE.writeDouble(mapBuilder, random.nextDouble());
                    }
                    builder.closeEntry();

                    if (pageBuilder.isFull()) {
                        Page page = pageBuilder.build();
                        pages.add(page);
                        pageBuilder.reset();
                        dataSize += page.getSizeInBytes();
                    }
                }
                return new TestData(ImmutableList.of("map"), ImmutableList.of(type), pages.build());
            }
        },
        LARGE_MAP_VARCHAR_DOUBLE {
            private static final int MIN_ENTRIES = 5_000;
            private static final int MAX_ENTRIES = 15_000;

            @Override
            public TestData createTestData(FileFormat format)
            {
                Type type = mapType(createUnboundedVarcharType(), DOUBLE);
                Random random = new Random(1234);

                PageBuilder pageBuilder = new PageBuilder(ImmutableList.of(type));
                ImmutableList.Builder<Page> pages = ImmutableList.builder();

                long dataSize = 0;
                while (dataSize < MIN_DATA_SIZE) {
                    pageBuilder.declarePosition();

                    BlockBuilder builder = pageBuilder.getBlockBuilder(0);
                    BlockBuilder mapBuilder = builder.beginBlockEntry();
                    int entries = nextRandomBetween(random, MIN_ENTRIES, MAX_ENTRIES);
                    for (int entryId = 0; entryId < entries; entryId++) {
                        createUnboundedVarcharType().writeSlice(mapBuilder, Slices.utf8Slice("key" + random.nextInt(10_000_000)));
                        DOUBLE.writeDouble(mapBuilder, random.nextDouble());
                    }
                    builder.closeEntry();

                    if (pageBuilder.isFull()) {
                        Page page = pageBuilder.build();
                        pages.add(page);
                        pageBuilder.reset();
                        dataSize += page.getSizeInBytes();
                    }
                }
                return new TestData(ImmutableList.of("map"), ImmutableList.of(type), pages.build());
            }
        },
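        // Integer-keyed counterparts of the VARCHAR-keyed map data sets above.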
        MAP_INT_DOUBLE {
            private static final int MIN_ENTRIES = 1;
            private static final int MAX_ENTRIES = 5;

            @Override
            public TestData createTestData(FileFormat format)
            {
                Type type = mapType(INTEGER, DOUBLE);
                Random random = new Random(1234);

                PageBuilder pageBuilder = new PageBuilder(ImmutableList.of(type));
                ImmutableList.Builder<Page> pages = ImmutableList.builder();
                int[] keys = new int[] {1, 2, 3, 4, 5};

                long dataSize = 0;
                while (dataSize < MIN_DATA_SIZE) {
                    pageBuilder.declarePosition();

                    BlockBuilder builder = pageBuilder.getBlockBuilder(0);
                    BlockBuilder mapBuilder = builder.beginBlockEntry();
                    int entries = nextRandomBetween(random, MIN_ENTRIES, MAX_ENTRIES);
                    IntArrays.shuffle(keys, random);
                    for (int entryId = 0; entryId < entries; entryId++) {
                        INTEGER.writeLong(mapBuilder, keys[entryId]);
                        DOUBLE.writeDouble(mapBuilder, random.nextDouble());
                    }
                    builder.closeEntry();

                    if (pageBuilder.isFull()) {
                        Page page = pageBuilder.build();
                        pages.add(page);
                        pageBuilder.reset();
                        dataSize += page.getSizeInBytes();
                    }
                }
                return new TestData(ImmutableList.of("map"), ImmutableList.of(type), pages.build());
            }
        },
        LARGE_MAP_INT_DOUBLE {
            private static final int MIN_ENTRIES = 5_000;
            private static final int MAX_ENTRIES = 15_000;

            @Override
            public TestData createTestData(FileFormat format)
            {
                Type type = mapType(INTEGER, DOUBLE);
                Random random = new Random(1234);

                PageBuilder pageBuilder = new PageBuilder(ImmutableList.of(type));
                ImmutableList.Builder<Page> pages = ImmutableList.builder();

                long dataSize = 0;
                while (dataSize < MIN_DATA_SIZE) {
                    pageBuilder.declarePosition();

                    BlockBuilder builder = pageBuilder.getBlockBuilder(0);
                    BlockBuilder mapBuilder = builder.beginBlockEntry();
                    int entries = nextRandomBetween(random, MIN_ENTRIES, MAX_ENTRIES);
                    for (int entryId = 0; entryId < entries; entryId++) {
                        INTEGER.writeLong(mapBuilder, random.nextInt(10_000_000));
                        DOUBLE.writeDouble(mapBuilder, random.nextDouble());
                    }
                    builder.closeEntry();

                    if (pageBuilder.isFull()) {
                        Page page = pageBuilder.build();
                        pages.add(page);
                        pageBuilder.reset();
                        dataSize += page.getSizeInBytes();
                    }
                }
                return new TestData(ImmutableList.of("map"), ImmutableList.of(type), pages.build());
            }
        },
        LARGE_ARRAY_VARCHAR {
            private static final int MIN_ENTRIES = 5_000;
            private static final int MAX_ENTRIES = 15_000;

            @Override
            public TestData createTestData(FileFormat format)
            {
                Type type = new ArrayType(createUnboundedVarcharType());
                Random random = new Random(1234);

                PageBuilder pageBuilder = new PageBuilder(ImmutableList.of(type));
                ImmutableList.Builder<Page> pages = ImmutableList.builder();

                long dataSize = 0;
                while (dataSize < MIN_DATA_SIZE) {
                    pageBuilder.declarePosition();

                    BlockBuilder builder = pageBuilder.getBlockBuilder(0);
                    BlockBuilder mapBuilder = builder.beginBlockEntry();
                    int entries = nextRandomBetween(random, MIN_ENTRIES, MAX_ENTRIES);
                    for (int entryId = 0; entryId < entries; entryId++) {
                        createUnboundedVarcharType().writeSlice(mapBuilder, Slices.utf8Slice("key" + random.nextInt(10_000_000)));
                    }
                    builder.closeEntry();

                    if (pageBuilder.isFull()) {
                        Page page = pageBuilder.build();
                        pages.add(page);
                        pageBuilder.reset();
                        dataSize += page.getSizeInBytes();
                    }
                }
                return new TestData(ImmutableList.of("map"), ImmutableList.of(type), pages.build());
            }
        };

        public abstract TestData createTestData(FileFormat format);
    }

    @SafeVarargs
    private static <E extends TpchEntity> TestData createTpchDataSet(FileFormat format, TpchTable<E> tpchTable, TpchColumn<E>... columns)
    {
        return createTpchDataSet(format, tpchTable, ImmutableList.copyOf(columns));
    }
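
    // Generates TPC-H rows until MIN_DATA_SIZE worth of pages has been produced. For file
    // formats that cannot store DATE natively, DATE columns are written as unbounded VARCHAR.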
    private static <E extends TpchEntity> TestData createTpchDataSet(FileFormat format, TpchTable<E> tpchTable, List<TpchColumn<E>> columns)
    {
        List<String> columnNames = columns.stream().map(TpchColumn::getColumnName).collect(toList());
        List<Type> columnTypes = columns.stream().map(HiveFileFormatBenchmark::getColumnType)
                .map(type -> format.supportsDate() || !DATE.equals(type) ? type : createUnboundedVarcharType())
                .collect(toList());

        PageBuilder pageBuilder = new PageBuilder(columnTypes);
        ImmutableList.Builder<Page> pages = ImmutableList.builder();
        long dataSize = 0;
        for (E row : tpchTable.createGenerator(10, 1, 1)) {
            pageBuilder.declarePosition();
            for (int i = 0; i < columns.size(); i++) {
                TpchColumn<E> column = columns.get(i);
                BlockBuilder blockBuilder = pageBuilder.getBlockBuilder(i);
                switch (column.getType().getBase()) {
                    case IDENTIFIER:
                        BIGINT.writeLong(blockBuilder, column.getIdentifier(row));
                        break;
                    case INTEGER:
                        INTEGER.writeLong(blockBuilder, column.getInteger(row));
                        break;
                    case DATE:
                        if (format.supportsDate()) {
                            DATE.writeLong(blockBuilder, column.getDate(row));
                        }
                        else {
                            createUnboundedVarcharType().writeString(blockBuilder, column.getString(row));
                        }
                        break;
                    case DOUBLE:
                        DOUBLE.writeDouble(blockBuilder, column.getDouble(row));
                        break;
                    case VARCHAR:
                        createUnboundedVarcharType().writeSlice(blockBuilder, Slices.utf8Slice(column.getString(row)));
                        break;
                    default:
                        throw new IllegalArgumentException("Unsupported type " + column.getType());
                }
            }
            if (pageBuilder.isFull()) {
                Page page = pageBuilder.build();
                pages.add(page);
                pageBuilder.reset();
                dataSize += page.getSizeInBytes();
                if (dataSize >= MIN_DATA_SIZE) {
                    break;
                }
            }
        }
        return new TestData(columnNames, columnTypes, pages.build());
    }

    static class TestData
    {
        private final List<String> columnNames;
        private final List<Type> columnTypes;
        private final List<Page> pages;
        private final int size;

        public TestData(List<String> columnNames, List<Type> columnTypes, List<Page> pages)
        {
            this.columnNames = ImmutableList.copyOf(columnNames);
            this.columnTypes = ImmutableList.copyOf(columnTypes);
            this.pages = ImmutableList.copyOf(pages);
            this.size = (int) pages.stream().mapToLong(Page::getSizeInBytes).sum();
        }

        public List<String> getColumnNames()
        {
            return columnNames;
        }

        public List<Type> getColumnTypes()
        {
            return columnTypes;
        }

        public List<Page> getPages()
        {
            return pages;
        }

        public int getSize()
        {
            return size;
        }
    }

    private static Type getColumnType(TpchColumn<?> input)
    {
        switch (input.getType().getBase()) {
            case IDENTIFIER:
                return BIGINT;
            case INTEGER:
                return INTEGER;
            case DATE:
                return DATE;
            case DOUBLE:
                return DOUBLE;
            case VARCHAR:
                return createUnboundedVarcharType();
        }
        throw new IllegalArgumentException("Unsupported type " + input.getType());
    }
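
    // Runs the full parameter matrix and prints throughput together with the achieved
    // compression ratio (inputSize / outputSize from the aux counters). To try a single
    // combination, the parameter space can be narrowed with OptionsBuilder.param(...),
    // for example .param("fileFormat", "PRESTO_ORC").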
    public static void main(String[] args)
            throws Exception
    {
        Options opt = new OptionsBuilder()
                .include(".*\\." + HiveFileFormatBenchmark.class.getSimpleName() + ".*")
                .jvmArgsAppend("-Xmx4g", "-Xms4g", "-XX:+UseG1GC")
                .build();

        Collection<RunResult> results = new Runner(opt).run();

        for (RunResult result : results) {
            Statistics inputSizeStats = result.getSecondaryResults().get("inputSize").getStatistics();
            Statistics outputSizeStats = result.getSecondaryResults().get("outputSize").getStatistics();
            double compressionRatio = 1.0 * inputSizeStats.getSum() / outputSizeStats.getSum();
            String compression = result.getParams().getParam("compression");
            String fileFormat = result.getParams().getParam("fileFormat");
            String dataSet = result.getParams().getParam("dataSet");
            System.out.printf(" %-10s %-30s %-10s %-25s %2.2f %10s ± %11s (%5.2f%%) (N = %d, \u03B1 = 99.9%%)\n",
                    result.getPrimaryResult().getLabel(),
                    dataSet,
                    compression,
                    fileFormat,
                    compressionRatio,
                    toHumanReadableSpeed((long) inputSizeStats.getMean()),
                    toHumanReadableSpeed((long) inputSizeStats.getMeanErrorAt(0.999)),
                    inputSizeStats.getMeanErrorAt(0.999) * 100 / inputSizeStats.getMean(),
                    inputSizeStats.getN());
        }
        System.out.println();
    }

    private static String toHumanReadableSpeed(long bytesPerSecond)
    {
        String humanReadableSpeed;
        if (bytesPerSecond < 1024 * 10L) {
            humanReadableSpeed = format("%dB/s", bytesPerSecond);
        }
        else if (bytesPerSecond < 1024 * 1024 * 10L) {
            humanReadableSpeed = format("%.1fkB/s", bytesPerSecond / 1024.0f);
        }
        else if (bytesPerSecond < 1024 * 1024 * 1024 * 10L) {
            humanReadableSpeed = format("%.1fMB/s", bytesPerSecond / (1024.0f * 1024.0f));
        }
        else {
            humanReadableSpeed = format("%.1fGB/s", bytesPerSecond / (1024.0f * 1024.0f * 1024.0f));
        }
        return humanReadableSpeed;
    }

    private static int nextRandomBetween(Random random, int min, int max)
    {
        return min + random.nextInt(max - min);
    }
}