/* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.facebook.presto.raptor.storage; import com.facebook.presto.block.BlockEncodingManager; import com.facebook.presto.metadata.FunctionRegistry; import com.facebook.presto.orc.OrcDataSource; import com.facebook.presto.orc.OrcRecordReader; import com.facebook.presto.raptor.storage.OrcFileRewriter.OrcFileInfo; import com.facebook.presto.spi.Page; import com.facebook.presto.spi.block.Block; import com.facebook.presto.spi.type.StandardTypes; import com.facebook.presto.spi.type.Type; import com.facebook.presto.spi.type.TypeManager; import com.facebook.presto.spi.type.TypeSignature; import com.facebook.presto.spi.type.TypeSignatureParameter; import com.facebook.presto.sql.analyzer.FeaturesConfig; import com.facebook.presto.type.ArrayType; import com.facebook.presto.type.TypeRegistry; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import io.airlift.json.JsonCodec; import org.testng.annotations.AfterClass; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; import java.io.File; import java.util.BitSet; import java.util.List; import static com.facebook.presto.RowPagesBuilder.rowPagesBuilder; import static com.facebook.presto.raptor.storage.OrcTestingUtil.createReader; import static com.facebook.presto.raptor.storage.OrcTestingUtil.fileOrcDataSource; import static com.facebook.presto.spi.type.BigintType.BIGINT; import static com.facebook.presto.spi.type.BooleanType.BOOLEAN; import static com.facebook.presto.spi.type.DoubleType.DOUBLE; import static com.facebook.presto.spi.type.VarbinaryType.VARBINARY; import static com.facebook.presto.spi.type.VarcharType.createVarcharType; import static com.facebook.presto.tests.StructuralTestUtil.arrayBlockOf; import static com.facebook.presto.tests.StructuralTestUtil.arrayBlocksEqual; import static com.facebook.presto.tests.StructuralTestUtil.mapBlockOf; import static com.facebook.presto.tests.StructuralTestUtil.mapBlocksEqual; import static com.google.common.io.Files.createTempDir; import static io.airlift.json.JsonCodec.jsonCodec; import static io.airlift.slice.Slices.utf8Slice; import static io.airlift.testing.FileUtils.deleteRecursively; import static java.nio.file.Files.readAllBytes; import static java.util.UUID.randomUUID; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertFalse; import static org.testng.Assert.assertTrue; @Test(singleThreaded = true) public class TestOrcFileRewriter { private static final JsonCodec<OrcFileMetadata> METADATA_CODEC = jsonCodec(OrcFileMetadata.class); private File temporary; @BeforeClass public void setup() throws Exception { temporary = createTempDir(); } @AfterClass(alwaysRun = true) public void tearDown() throws Exception { deleteRecursively(temporary); } @Test public void testRewrite() throws Exception { TypeManager typeManager = new TypeRegistry(); // associate typeManager with a function registry new FunctionRegistry(typeManager, new BlockEncodingManager(typeManager), new FeaturesConfig()); ArrayType arrayType = new ArrayType(BIGINT); ArrayType arrayOfArrayType = new ArrayType(arrayType); Type mapType = typeManager.getParameterizedType(StandardTypes.MAP, ImmutableList.of( TypeSignatureParameter.of(createVarcharType(5).getTypeSignature()), TypeSignatureParameter.of(BOOLEAN.getTypeSignature()))); List<Long> columnIds = ImmutableList.of(3L, 7L, 9L, 10L, 11L); List<Type> columnTypes = ImmutableList.of(BIGINT, createVarcharType(20), arrayType, mapType, arrayOfArrayType); File file = new File(temporary, randomUUID().toString()); try (OrcFileWriter writer = new OrcFileWriter(columnIds, columnTypes, file)) { List<Page> pages = rowPagesBuilder(columnTypes) .row(123L, "hello", arrayBlockOf(BIGINT, 1, 2), mapBlockOf(createVarcharType(5), BOOLEAN, "k1", true), arrayBlockOf(arrayType, arrayBlockOf(BIGINT, 5))) .row(777L, "sky", arrayBlockOf(BIGINT, 3, 4), mapBlockOf(createVarcharType(5), BOOLEAN, "k2", false), arrayBlockOf(arrayType, arrayBlockOf(BIGINT, 6))) .row(456L, "bye", arrayBlockOf(BIGINT, 5, 6), mapBlockOf(createVarcharType(5), BOOLEAN, "k3", true), arrayBlockOf(arrayType, arrayBlockOf(BIGINT, 7))) .row(888L, "world", arrayBlockOf(BIGINT, 7, 8), mapBlockOf(createVarcharType(5), BOOLEAN, "k4", true), arrayBlockOf(arrayType, null, arrayBlockOf(BIGINT, 8), null)) .row(999L, "done", arrayBlockOf(BIGINT, 9, 10), mapBlockOf(createVarcharType(5), BOOLEAN, "k5", true), arrayBlockOf(arrayType, arrayBlockOf(BIGINT, 9, 10))) .build(); writer.appendPages(pages); } try (OrcDataSource dataSource = fileOrcDataSource(file)) { OrcRecordReader reader = createReader(dataSource, columnIds, columnTypes); assertEquals(reader.getReaderRowCount(), 5); assertEquals(reader.getFileRowCount(), 5); assertEquals(reader.getSplitLength(), file.length()); assertEquals(reader.nextBatch(), 5); Block column0 = reader.readBlock(BIGINT, 0); assertEquals(column0.getPositionCount(), 5); for (int i = 0; i < 5; i++) { assertEquals(column0.isNull(i), false); } assertEquals(BIGINT.getLong(column0, 0), 123L); assertEquals(BIGINT.getLong(column0, 1), 777L); assertEquals(BIGINT.getLong(column0, 2), 456L); assertEquals(BIGINT.getLong(column0, 3), 888L); assertEquals(BIGINT.getLong(column0, 4), 999L); Block column1 = reader.readBlock(createVarcharType(20), 1); assertEquals(column1.getPositionCount(), 5); for (int i = 0; i < 5; i++) { assertEquals(column1.isNull(i), false); } assertEquals(createVarcharType(20).getSlice(column1, 0), utf8Slice("hello")); assertEquals(createVarcharType(20).getSlice(column1, 1), utf8Slice("sky")); assertEquals(createVarcharType(20).getSlice(column1, 2), utf8Slice("bye")); assertEquals(createVarcharType(20).getSlice(column1, 3), utf8Slice("world")); assertEquals(createVarcharType(20).getSlice(column1, 4), utf8Slice("done")); Block column2 = reader.readBlock(arrayType, 2); assertEquals(column2.getPositionCount(), 5); for (int i = 0; i < 5; i++) { assertEquals(column2.isNull(i), false); } assertTrue(arrayBlocksEqual(BIGINT, arrayType.getObject(column2, 0), arrayBlockOf(BIGINT, 1, 2))); assertTrue(arrayBlocksEqual(BIGINT, arrayType.getObject(column2, 1), arrayBlockOf(BIGINT, 3, 4))); assertTrue(arrayBlocksEqual(BIGINT, arrayType.getObject(column2, 2), arrayBlockOf(BIGINT, 5, 6))); assertTrue(arrayBlocksEqual(BIGINT, arrayType.getObject(column2, 3), arrayBlockOf(BIGINT, 7, 8))); assertTrue(arrayBlocksEqual(BIGINT, arrayType.getObject(column2, 4), arrayBlockOf(BIGINT, 9, 10))); Block column3 = reader.readBlock(mapType, 3); assertEquals(column3.getPositionCount(), 5); for (int i = 0; i < 5; i++) { assertEquals(column3.isNull(i), false); } assertTrue(mapBlocksEqual(createVarcharType(5), BOOLEAN, arrayType.getObject(column3, 0), mapBlockOf(createVarcharType(5), BOOLEAN, "k1", true))); assertTrue(mapBlocksEqual(createVarcharType(5), BOOLEAN, arrayType.getObject(column3, 1), mapBlockOf(createVarcharType(5), BOOLEAN, "k2", false))); assertTrue(mapBlocksEqual(createVarcharType(5), BOOLEAN, arrayType.getObject(column3, 2), mapBlockOf(createVarcharType(5), BOOLEAN, "k3", true))); assertTrue(mapBlocksEqual(createVarcharType(5), BOOLEAN, arrayType.getObject(column3, 3), mapBlockOf(createVarcharType(5), BOOLEAN, "k4", true))); assertTrue(mapBlocksEqual(createVarcharType(5), BOOLEAN, arrayType.getObject(column3, 4), mapBlockOf(createVarcharType(5), BOOLEAN, "k5", true))); Block column4 = reader.readBlock(arrayOfArrayType, 4); assertEquals(column4.getPositionCount(), 5); for (int i = 0; i < 5; i++) { assertEquals(column4.isNull(i), false); } assertTrue(arrayBlocksEqual(arrayType, arrayOfArrayType.getObject(column4, 0), arrayBlockOf(arrayType, arrayBlockOf(BIGINT, 5)))); assertTrue(arrayBlocksEqual(arrayType, arrayOfArrayType.getObject(column4, 1), arrayBlockOf(arrayType, arrayBlockOf(BIGINT, 6)))); assertTrue(arrayBlocksEqual(arrayType, arrayOfArrayType.getObject(column4, 2), arrayBlockOf(arrayType, arrayBlockOf(BIGINT, 7)))); assertTrue(arrayBlocksEqual(arrayType, arrayOfArrayType.getObject(column4, 3), arrayBlockOf(arrayType, null, arrayBlockOf(BIGINT, 8), null))); assertTrue(arrayBlocksEqual(arrayType, arrayOfArrayType.getObject(column4, 4), arrayBlockOf(arrayType, arrayBlockOf(BIGINT, 9, 10)))); assertEquals(reader.nextBatch(), -1); OrcFileMetadata orcFileMetadata = METADATA_CODEC.fromJson(reader.getUserMetadata().get(OrcFileMetadata.KEY).getBytes()); assertEquals(orcFileMetadata, new OrcFileMetadata(ImmutableMap.<Long, TypeSignature>builder() .put(3L, BIGINT.getTypeSignature()) .put(7L, createVarcharType(20).getTypeSignature()) .put(9L, arrayType.getTypeSignature()) .put(10L, mapType.getTypeSignature()) .put(11L, arrayOfArrayType.getTypeSignature()) .build() )); } BitSet rowsToDelete = new BitSet(5); rowsToDelete.set(1); rowsToDelete.set(3); rowsToDelete.set(4); File newFile = new File(temporary, randomUUID().toString()); OrcFileInfo info = OrcFileRewriter.rewrite(file, newFile, rowsToDelete); assertEquals(info.getRowCount(), 2); assertEquals(info.getUncompressedSize(), 78); try (OrcDataSource dataSource = fileOrcDataSource(newFile)) { OrcRecordReader reader = createReader(dataSource, columnIds, columnTypes); assertEquals(reader.getReaderRowCount(), 2); assertEquals(reader.getFileRowCount(), 2); assertEquals(reader.getSplitLength(), newFile.length()); assertEquals(reader.nextBatch(), 2); Block column0 = reader.readBlock(BIGINT, 0); assertEquals(column0.getPositionCount(), 2); for (int i = 0; i < 2; i++) { assertEquals(column0.isNull(i), false); } assertEquals(BIGINT.getLong(column0, 0), 123L); assertEquals(BIGINT.getLong(column0, 1), 456L); Block column1 = reader.readBlock(createVarcharType(20), 1); assertEquals(column1.getPositionCount(), 2); for (int i = 0; i < 2; i++) { assertEquals(column1.isNull(i), false); } assertEquals(createVarcharType(20).getSlice(column1, 0), utf8Slice("hello")); assertEquals(createVarcharType(20).getSlice(column1, 1), utf8Slice("bye")); Block column2 = reader.readBlock(arrayType, 2); assertEquals(column2.getPositionCount(), 2); for (int i = 0; i < 2; i++) { assertEquals(column2.isNull(i), false); } assertTrue(arrayBlocksEqual(BIGINT, arrayType.getObject(column2, 0), arrayBlockOf(BIGINT, 1, 2))); assertTrue(arrayBlocksEqual(BIGINT, arrayType.getObject(column2, 1), arrayBlockOf(BIGINT, 5, 6))); Block column3 = reader.readBlock(mapType, 3); assertEquals(column3.getPositionCount(), 2); for (int i = 0; i < 2; i++) { assertEquals(column3.isNull(i), false); } assertTrue(mapBlocksEqual(createVarcharType(5), BOOLEAN, arrayType.getObject(column3, 0), mapBlockOf(createVarcharType(5), BOOLEAN, "k1", true))); assertTrue(mapBlocksEqual(createVarcharType(5), BOOLEAN, arrayType.getObject(column3, 1), mapBlockOf(createVarcharType(5), BOOLEAN, "k3", true))); Block column4 = reader.readBlock(arrayOfArrayType, 4); assertEquals(column4.getPositionCount(), 2); for (int i = 0; i < 2; i++) { assertEquals(column4.isNull(i), false); } assertTrue(arrayBlocksEqual(arrayType, arrayOfArrayType.getObject(column4, 0), arrayBlockOf(arrayType, arrayBlockOf(BIGINT, 5)))); assertTrue(arrayBlocksEqual(arrayType, arrayOfArrayType.getObject(column4, 1), arrayBlockOf(arrayType, arrayBlockOf(BIGINT, 7)))); assertEquals(reader.nextBatch(), -1); OrcFileMetadata orcFileMetadata = METADATA_CODEC.fromJson(reader.getUserMetadata().get(OrcFileMetadata.KEY).getBytes()); assertEquals(orcFileMetadata, new OrcFileMetadata(ImmutableMap.<Long, TypeSignature>builder() .put(3L, BIGINT.getTypeSignature()) .put(7L, createVarcharType(20).getTypeSignature()) .put(9L, arrayType.getTypeSignature()) .put(10L, mapType.getTypeSignature()) .put(11L, arrayOfArrayType.getTypeSignature()) .build() )); } } @Test public void testRewriteWithoutMetadata() throws Exception { List<Long> columnIds = ImmutableList.of(3L, 7L); List<Type> columnTypes = ImmutableList.of(BIGINT, createVarcharType(20)); File file = new File(temporary, randomUUID().toString()); try (OrcFileWriter writer = new OrcFileWriter(columnIds, columnTypes, file, false)) { List<Page> pages = rowPagesBuilder(columnTypes) .row(123L, "hello") .row(777L, "sky") .build(); writer.appendPages(pages); } try (OrcDataSource dataSource = fileOrcDataSource(file)) { OrcRecordReader reader = createReader(dataSource, columnIds, columnTypes); assertEquals(reader.getReaderRowCount(), 2); assertEquals(reader.getFileRowCount(), 2); assertEquals(reader.getSplitLength(), file.length()); assertEquals(reader.nextBatch(), 2); Block column0 = reader.readBlock(BIGINT, 0); assertEquals(column0.getPositionCount(), 2); for (int i = 0; i < 2; i++) { assertEquals(column0.isNull(i), false); } assertEquals(BIGINT.getLong(column0, 0), 123L); assertEquals(BIGINT.getLong(column0, 1), 777L); Block column1 = reader.readBlock(createVarcharType(20), 1); assertEquals(column1.getPositionCount(), 2); for (int i = 0; i < 2; i++) { assertEquals(column1.isNull(i), false); } assertEquals(createVarcharType(20).getSlice(column1, 0), utf8Slice("hello")); assertEquals(createVarcharType(20).getSlice(column1, 1), utf8Slice("sky")); assertFalse(reader.getUserMetadata().containsKey(OrcFileMetadata.KEY)); } BitSet rowsToDelete = new BitSet(5); rowsToDelete.set(1); File newFile = new File(temporary, randomUUID().toString()); OrcFileInfo info = OrcFileRewriter.rewrite(file, newFile, rowsToDelete); assertEquals(info.getRowCount(), 1); assertEquals(info.getUncompressedSize(), 13); try (OrcDataSource dataSource = fileOrcDataSource(newFile)) { OrcRecordReader reader = createReader(dataSource, columnIds, columnTypes); assertEquals(reader.getReaderRowCount(), 1); assertEquals(reader.getFileRowCount(), 1); assertEquals(reader.getSplitLength(), newFile.length()); assertEquals(reader.nextBatch(), 1); Block column0 = reader.readBlock(BIGINT, 0); assertEquals(column0.getPositionCount(), 1); assertEquals(column0.isNull(0), false); assertEquals(BIGINT.getLong(column0, 0), 123L); Block column1 = reader.readBlock(createVarcharType(20), 1); assertEquals(column1.getPositionCount(), 1); assertEquals(column1.isNull(0), false); assertEquals(createVarcharType(20).getSlice(column1, 0), utf8Slice("hello")); assertFalse(reader.getUserMetadata().containsKey(OrcFileMetadata.KEY)); } } @Test public void testRewriteAllRowsDeleted() throws Exception { List<Long> columnIds = ImmutableList.of(3L); List<Type> columnTypes = ImmutableList.of(BIGINT); File file = new File(temporary, randomUUID().toString()); try (OrcFileWriter writer = new OrcFileWriter(columnIds, columnTypes, file)) { writer.appendPages(rowPagesBuilder(columnTypes).row(123L).row(456L).build()); } BitSet rowsToDelete = new BitSet(); rowsToDelete.set(0); rowsToDelete.set(1); File newFile = new File(temporary, randomUUID().toString()); OrcFileInfo info = OrcFileRewriter.rewrite(file, newFile, rowsToDelete); assertEquals(info.getRowCount(), 0); assertEquals(info.getUncompressedSize(), 0); assertFalse(newFile.exists()); } @Test public void testRewriteNoRowsDeleted() throws Exception { List<Long> columnIds = ImmutableList.of(3L); List<Type> columnTypes = ImmutableList.of(BIGINT); File file = new File(temporary, randomUUID().toString()); try (OrcFileWriter writer = new OrcFileWriter(columnIds, columnTypes, file)) { writer.appendPages(rowPagesBuilder(columnTypes).row(123L).row(456L).build()); } BitSet rowsToDelete = new BitSet(); File newFile = new File(temporary, randomUUID().toString()); OrcFileInfo info = OrcFileRewriter.rewrite(file, newFile, rowsToDelete); assertEquals(info.getRowCount(), 2); assertEquals(info.getUncompressedSize(), 16); assertEquals(readAllBytes(newFile.toPath()), readAllBytes(file.toPath())); } @Test public void testUncompressedSize() throws Exception { List<Long> columnIds = ImmutableList.of(1L, 2L, 3L, 4L, 5L); List<Type> columnTypes = ImmutableList.of(BOOLEAN, BIGINT, DOUBLE, createVarcharType(10), VARBINARY); File file = new File(temporary, randomUUID().toString()); try (OrcFileWriter writer = new OrcFileWriter(columnIds, columnTypes, file)) { List<Page> pages = rowPagesBuilder(columnTypes) .row(true, 123L, 98.7, "hello", utf8Slice("abc")) .row(false, 456L, 65.4, "world", utf8Slice("xyz")) .row(null, null, null, null, null) .build(); writer.appendPages(pages); } File newFile = new File(temporary, randomUUID().toString()); OrcFileInfo info = OrcFileRewriter.rewrite(file, newFile, new BitSet()); assertEquals(info.getRowCount(), 3); assertEquals(info.getUncompressedSize(), 55); } }