/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package parquet.format.converter;

import static java.util.Collections.emptyList;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.fail;
import static parquet.format.CompressionCodec.UNCOMPRESSED;
import static parquet.format.Type.INT32;
import static parquet.format.Util.readPageHeader;
import static parquet.format.Util.writePageHeader;
import static parquet.format.converter.ParquetMetadataConverter.filterFileMetaData;
import static parquet.format.converter.ParquetMetadataConverter.getOffset;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Random;
import java.util.Set;
import java.util.TreeSet;

import org.junit.Assert;
import org.junit.Test;

import parquet.column.Encoding;
import parquet.example.Paper;
import parquet.format.ColumnChunk;
import parquet.format.ColumnMetaData;
import parquet.format.ConvertedType;
import parquet.format.FieldRepetitionType;
import parquet.format.FileMetaData;
import parquet.format.PageHeader;
import parquet.format.PageType;
import parquet.format.RowGroup;
import parquet.format.SchemaElement;
import parquet.format.Type;
import parquet.schema.MessageType;
import parquet.schema.OriginalType;
import parquet.schema.PrimitiveType.PrimitiveTypeName;
import parquet.schema.Type.Repetition;
import parquet.schema.Types;

import com.google.common.collect.Lists;

public class TestParquetMetadataConverter {

  @Test
  public void testPageHeader() throws IOException {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    PageType type = PageType.DATA_PAGE;
    int compSize = 10;
    int uncSize = 20;
    PageHeader pageHeader = new PageHeader(type, uncSize, compSize);
    writePageHeader(pageHeader, out);
    PageHeader readPageHeader = readPageHeader(new ByteArrayInputStream(out.toByteArray()));
    assertEquals(pageHeader, readPageHeader);
  }

  @Test
  public void testSchemaConverter() {
    ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter();
    List<SchemaElement> parquetSchema = parquetMetadataConverter.toParquetSchema(Paper.schema);
    MessageType schema = parquetMetadataConverter.fromParquetSchema(parquetSchema);
    assertEquals(Paper.schema, schema);
  }

  @Test
  public void testSchemaConverterDecimal() {
    ParquetMetadataConverter converter = new ParquetMetadataConverter();
    List<SchemaElement> schemaElements = converter.toParquetSchema(
        Types.buildMessage()
            .required(PrimitiveTypeName.BINARY)
                .as(OriginalType.DECIMAL).precision(9).scale(2)
                .named("aBinaryDecimal")
            .optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(4)
                .as(OriginalType.DECIMAL).precision(9).scale(2)
                .named("aFixedDecimal")
            .named("Message")
    );
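
    // expected Thrift schema elements, including converted type, precision,
    // scale, and fixed length for the two decimal columns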
    List<SchemaElement> expected = Lists.newArrayList(
        new SchemaElement("Message").setNum_children(2),
        new SchemaElement("aBinaryDecimal")
            .setRepetition_type(FieldRepetitionType.REQUIRED)
            .setType(Type.BYTE_ARRAY)
            .setConverted_type(ConvertedType.DECIMAL)
            .setPrecision(9).setScale(2),
        new SchemaElement("aFixedDecimal")
            .setRepetition_type(FieldRepetitionType.OPTIONAL)
            .setType(Type.FIXED_LEN_BYTE_ARRAY)
            .setType_length(4)
            .setConverted_type(ConvertedType.DECIMAL)
            .setPrecision(9).setScale(2)
    );

    Assert.assertEquals(expected, schemaElements);
  }

  @Test
  public void testEnumEquivalence() {
    ParquetMetadataConverter c = new ParquetMetadataConverter();
    for (Encoding encoding : Encoding.values()) {
      assertEquals(encoding, c.getEncoding(c.getEncoding(encoding)));
    }
    for (parquet.format.Encoding encoding : parquet.format.Encoding.values()) {
      assertEquals(encoding, c.getEncoding(c.getEncoding(encoding)));
    }
    for (Repetition repetition : Repetition.values()) {
      assertEquals(repetition, c.fromParquetRepetition(c.toParquetRepetition(repetition)));
    }
    for (FieldRepetitionType repetition : FieldRepetitionType.values()) {
      assertEquals(repetition, c.toParquetRepetition(c.fromParquetRepetition(repetition)));
    }
    for (PrimitiveTypeName primitiveTypeName : PrimitiveTypeName.values()) {
      assertEquals(primitiveTypeName, c.getPrimitive(c.getType(primitiveTypeName)));
    }
    for (Type type : Type.values()) {
      assertEquals(type, c.getType(c.getPrimitive(type)));
    }
    for (OriginalType original : OriginalType.values()) {
      assertEquals(original, c.getOriginalType(c.getConvertedType(original)));
    }
    for (ConvertedType converted : ConvertedType.values()) {
      assertEquals(converted, c.getConvertedType(c.getOriginalType(converted)));
    }
  }

  private FileMetaData metadata(long... sizes) {
    List<SchemaElement> schema = emptyList();
    List<RowGroup> rowGroups = new ArrayList<RowGroup>();
    long offset = 0;
    for (long size : sizes) {
      ColumnChunk columnChunk = new ColumnChunk(offset);
      columnChunk.setMeta_data(new ColumnMetaData(
          INT32,
          Collections.<parquet.format.Encoding>emptyList(),
          Collections.<String>emptyList(),
          UNCOMPRESSED, 10L, size * 2, size, offset));
      rowGroups.add(new RowGroup(Arrays.asList(columnChunk), size, 1));
      offset += size;
    }
    return new FileMetaData(1, schema, sizes.length, rowGroups);
  }

  private FileMetaData filter(FileMetaData md, long start, long end) {
    return filterFileMetaData(new FileMetaData(md),
        new ParquetMetadataConverter.RangeMetadataFilter(start, end));
  }
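
  /**
   * Asserts that the filtered metadata contains exactly the row groups starting at the given offsets.
   */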
  private void verifyMD(FileMetaData md, long... offsets) {
    assertEquals(offsets.length, md.row_groups.size());
    for (int i = 0; i < offsets.length; i++) {
      long offset = offsets[i];
      RowGroup rowGroup = md.getRow_groups().get(i);
      assertEquals(offset, getOffset(rowGroup));
    }
  }

  /**
   * Verifies that the splits form a partition of the row groups:
   * every row group is picked up by exactly one split.
   *
   * @param md the file metadata to filter
   * @param splitWidth the width of each split range
   */
  private void verifyAllFilters(FileMetaData md, long splitWidth) {
    Set<Long> offsetsFound = new TreeSet<Long>();
    for (long start = 0; start < fileSize(md); start += splitWidth) {
      FileMetaData filtered = filter(md, start, start + splitWidth);
      for (RowGroup rg : filtered.getRow_groups()) {
        long o = getOffset(rg);
        if (offsetsFound.contains(o)) {
          fail("found the offset twice: " + o);
        } else {
          offsetsFound.add(o);
        }
      }
    }
    if (offsetsFound.size() != md.row_groups.size()) {
      fail("missing row groups, "
          + "found: " + offsetsFound
          + "\nexpected " + md.getRow_groups());
    }
  }

  private long fileSize(FileMetaData md) {
    long size = 0;
    for (RowGroup rg : md.getRow_groups()) {
      size += rg.total_byte_size;
    }
    return size;
  }

  @Test
  public void testFilterMetaData() {
    verifyMD(filter(metadata(50, 50, 50), 0, 50), 0);
    verifyMD(filter(metadata(50, 50, 50), 50, 100), 50);
    verifyMD(filter(metadata(50, 50, 50), 100, 150), 100);
    // picks up first RG
    verifyMD(filter(metadata(50, 50, 50), 25, 75), 0);
    // picks up no RG
    verifyMD(filter(metadata(50, 50, 50), 26, 75));
    // picks up second RG
    verifyMD(filter(metadata(50, 50, 50), 26, 76), 50);

    verifyAllFilters(metadata(50, 50, 50), 10);
    verifyAllFilters(metadata(50, 50, 50), 51);
    verifyAllFilters(metadata(50, 50, 50), 25); // corner cases are in the middle
    verifyAllFilters(metadata(50, 50, 50), 24);
    verifyAllFilters(metadata(50, 50, 50), 26);
    verifyAllFilters(metadata(50, 50, 50), 110);
    verifyAllFilters(metadata(10, 50, 500), 110);
    verifyAllFilters(metadata(10, 50, 500), 10);
    verifyAllFilters(metadata(10, 50, 500), 600);
    verifyAllFilters(metadata(11, 9, 10), 10);
    verifyAllFilters(metadata(11, 9, 10), 9);
    verifyAllFilters(metadata(11, 9, 10), 8);
  }

  @Test
  public void randomTestFilterMetaData() {
    // randomized property-based testing: if this fails, add the failing case to testFilterMetaData above
    Random random = new Random(System.currentTimeMillis());
    for (int j = 0; j < 100; j++) {
      long[] rgs = new long[random.nextInt(50)];
      for (int i = 0; i < rgs.length; i++) {
        rgs[i] = random.nextInt(10000) + 1; // no empty row groups
      }
      int splitSize = random.nextInt(10000);
      try {
        verifyAllFilters(metadata(rgs), splitSize);
      } catch (AssertionError e) {
        throw (AssertionError) new AssertionError(
            "fail verifyAllFilters(metadata(" + Arrays.toString(rgs) + "), " + splitSize + ")")
            .initCause(e);
      }
    }
  }
}