/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package parquet.hadoop;

import java.io.File;
import java.io.IOException;
import java.util.*;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.junit.Test;

import parquet.Log;
import parquet.bytes.BytesInput;
import parquet.column.ColumnDescriptor;
import parquet.column.Encoding;
import parquet.column.page.DataPage;
import parquet.column.page.DataPageV1;
import parquet.column.page.PageReadStore;
import parquet.column.page.PageReader;
import parquet.column.statistics.BinaryStatistics;
import parquet.column.statistics.LongStatistics;
import parquet.example.data.Group;
import parquet.example.data.simple.SimpleGroup;
import parquet.format.Statistics;
import parquet.hadoop.example.GroupWriteSupport;
import parquet.hadoop.metadata.*;
import parquet.hadoop.util.HiddenFileFilter;
import parquet.io.api.Binary;
import parquet.schema.MessageType;
import parquet.schema.MessageTypeParser;
import parquet.schema.PrimitiveType;
import parquet.schema.PrimitiveType.PrimitiveTypeName;

import static org.junit.Assert.*;
import static parquet.column.Encoding.BIT_PACKED;
import static parquet.column.Encoding.PLAIN;
import static parquet.hadoop.TestUtils.enforceEmptyDir;
import static parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;
import static parquet.schema.Type.Repetition.*;

public class TestParquetFileWriter {
private static final Log LOG = Log.getLog(TestParquetFileWriter.class);
private String writeSchema;
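
/**
 * Writer modes: creating a ParquetFileWriter in CREATE mode must fail with an
 * IOException when the target file already exists, while OVERWRITE must succeed
 * on the same path.
 */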
@Test
public void testWriteMode() throws Exception {
File testDir = new File("target/test/TestParquetFileWriter/");
testDir.mkdirs();
File testFile = new File(testDir, "testParquetFile");
testFile = testFile.getAbsoluteFile();
testFile.createNewFile();
MessageType schema = MessageTypeParser.parseMessageType(
"message m { required group a {required binary b;} required group "
+ "c { required int64 d; }}");
Configuration conf = new Configuration();
ParquetFileWriter writer = null;
boolean exceptionThrown = false;
Path path = new Path(testFile.toURI());
try {
writer = new ParquetFileWriter(conf, schema, path,
ParquetFileWriter.Mode.CREATE);
} catch (IOException ioe1) {
exceptionThrown = true;
}
assertTrue(exceptionThrown);
exceptionThrown = false;
try {
writer = new ParquetFileWriter(conf, schema, path,
ParquetFileWriter.Mode.OVERWRITE);
} catch (IOException ioe2) {
exceptionThrown = true;
}
assertFalse(exceptionThrown);
testFile.delete();
}
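
/**
 * Writes a two-column schema as two row groups of raw data pages, then checks the
 * footer (block count, column chunk sizes, encodings) and reads the pages back,
 * first for a single column of the first block and then for all columns of all blocks.
 */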
@Test
public void testWriteRead() throws Exception {
File testFile = new File("target/test/TestParquetFileWriter/testParquetFile").getAbsoluteFile();
testFile.delete();
Path path = new Path(testFile.toURI());
Configuration configuration = new Configuration();
MessageType schema = MessageTypeParser.parseMessageType("message m { required group a {required binary b;} required group c { required int64 d; }}");
String[] path1 = {"a", "b"};
ColumnDescriptor c1 = schema.getColumnDescription(path1);
String[] path2 = {"c", "d"};
ColumnDescriptor c2 = schema.getColumnDescription(path2);
byte[] bytes1 = {0, 1, 2, 3};
byte[] bytes2 = {1, 2, 3, 4};
byte[] bytes3 = {2, 3, 4, 5};
byte[] bytes4 = {3, 4, 5, 6};
CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;
BinaryStatistics stats1 = new BinaryStatistics();
BinaryStatistics stats2 = new BinaryStatistics();
ParquetFileWriter w = new ParquetFileWriter(configuration, schema, path);
w.start();
w.startBlock(3);
w.startColumn(c1, 5, codec);
long c1Starts = w.getPos();
w.writeDataPage(2, 4, BytesInput.from(bytes1), stats1, BIT_PACKED, BIT_PACKED, PLAIN);
w.writeDataPage(3, 4, BytesInput.from(bytes1), stats1, BIT_PACKED, BIT_PACKED, PLAIN);
w.endColumn();
long c1Ends = w.getPos();
w.startColumn(c2, 6, codec);
long c2Starts = w.getPos();
w.writeDataPage(2, 4, BytesInput.from(bytes2), stats2, BIT_PACKED, BIT_PACKED, PLAIN);
w.writeDataPage(3, 4, BytesInput.from(bytes2), stats2, BIT_PACKED, BIT_PACKED, PLAIN);
w.writeDataPage(1, 4, BytesInput.from(bytes2), stats2, BIT_PACKED, BIT_PACKED, PLAIN);
w.endColumn();
long c2Ends = w.getPos();
w.endBlock();
w.startBlock(4);
w.startColumn(c1, 7, codec);
w.writeDataPage(7, 4, BytesInput.from(bytes3), stats1, BIT_PACKED, BIT_PACKED, PLAIN);
w.endColumn();
w.startColumn(c2, 8, codec);
w.writeDataPage(8, 4, BytesInput.from(bytes4), stats2, BIT_PACKED, BIT_PACKED, PLAIN);
w.endColumn();
w.endBlock();
w.end(new HashMap<String, String>());
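
// the footer should describe both row groups, with column chunk sizes matching the positions recorded above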
ParquetMetadata readFooter = ParquetFileReader.readFooter(configuration, path);
assertEquals("footer: " + readFooter, 2, readFooter.getBlocks().size());
assertEquals(c1Ends - c1Starts, readFooter.getBlocks().get(0).getColumns().get(0).getTotalSize());
assertEquals(c2Ends - c2Starts, readFooter.getBlocks().get(0).getColumns().get(1).getTotalSize());
assertEquals(c2Ends - c1Starts, readFooter.getBlocks().get(0).getTotalByteSize());
HashSet<Encoding> expectedEncoding = new HashSet<Encoding>();
expectedEncoding.add(PLAIN);
expectedEncoding.add(BIT_PACKED);
assertEquals(expectedEncoding, readFooter.getBlocks().get(0).getColumns().get(0).getEncodings());
{ // read first block of col #1
ParquetFileReader r = new ParquetFileReader(configuration, path, Arrays.asList(readFooter.getBlocks().get(0)), Arrays.asList(schema.getColumnDescription(path1)));
PageReadStore pages = r.readNextRowGroup();
assertEquals(3, pages.getRowCount());
validateContains(schema, pages, path1, 2, BytesInput.from(bytes1));
validateContains(schema, pages, path1, 3, BytesInput.from(bytes1));
assertNull(r.readNextRowGroup());
}
{ // read all blocks of col #1 and #2
ParquetFileReader r = new ParquetFileReader(configuration, path, readFooter.getBlocks(), Arrays.asList(schema.getColumnDescription(path1), schema.getColumnDescription(path2)));
PageReadStore pages = r.readNextRowGroup();
assertEquals(3, pages.getRowCount());
validateContains(schema, pages, path1, 2, BytesInput.from(bytes1));
validateContains(schema, pages, path1, 3, BytesInput.from(bytes1));
validateContains(schema, pages, path2, 2, BytesInput.from(bytes2));
validateContains(schema, pages, path2, 3, BytesInput.from(bytes2));
validateContains(schema, pages, path2, 1, BytesInput.from(bytes2));
pages = r.readNextRowGroup();
assertEquals(4, pages.getRowCount());
validateContains(schema, pages, path1, 7, BytesInput.from(bytes3));
validateContains(schema, pages, path2, 8, BytesInput.from(bytes4));
assertNull(r.readNextRowGroup());
}
PrintFooter.main(new String[]{path.toString()});
}
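
/**
 * Round-trips LongStatistics through the Thrift representation
 * (toParquetStatistics / fromParquetStatistics) and checks that min, max and
 * the null count survive the conversion.
 */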
@Test
public void testConvertToThriftStatistics() throws Exception {
long[] longArray = new long[]{39L, 99L, 12L, 1000L, 65L, 542L, 2533461316L, -253346131996L, Long.MAX_VALUE, Long.MIN_VALUE};
LongStatistics parquetMRstats = new LongStatistics();
for (long l : longArray) {
parquetMRstats.updateStats(l);
}
Statistics thriftStats = parquet.format.converter.ParquetMetadataConverter.toParquetStatistics(parquetMRstats);
LongStatistics convertedBackStats = (LongStatistics) parquet.format.converter.ParquetMetadataConverter.fromParquetStatistics(thriftStats, PrimitiveTypeName.INT64);
assertEquals(parquetMRstats.getMax(), convertedBackStats.getMax());
assertEquals(parquetMRstats.getMin(), convertedBackStats.getMin());
assertEquals(parquetMRstats.getNumNulls(), convertedBackStats.getNumNulls());
}
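
/**
 * Writes pages with known per-page statistics and asserts that each column chunk's
 * statistics in the footer are the merge of the min/max values of its pages.
 */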
@Test
public void testWriteReadStatistics() throws Exception {
File testFile = new File("target/test/TestParquetFileWriter/testParquetFile").getAbsoluteFile();
testFile.delete();
Path path = new Path(testFile.toURI());
Configuration configuration = new Configuration();
MessageType schema = MessageTypeParser.parseMessageType("message m { required group a {required binary b;} required group c { required int64 d; }}");
String[] path1 = {"a", "b"};
ColumnDescriptor c1 = schema.getColumnDescription(path1);
String[] path2 = {"c", "d"};
ColumnDescriptor c2 = schema.getColumnDescription(path2);
byte[] bytes1 = {0, 1, 2, 3};
byte[] bytes2 = {1, 2, 3, 4};
byte[] bytes3 = {2, 3, 4, 5};
byte[] bytes4 = {3, 4, 5, 6};
CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;
BinaryStatistics statsB1C1P1 = new BinaryStatistics();
BinaryStatistics statsB1C1P2 = new BinaryStatistics();
LongStatistics statsB1C2P1 = new LongStatistics();
LongStatistics statsB1C2P2 = new LongStatistics();
BinaryStatistics statsB2C1P1 = new BinaryStatistics();
LongStatistics statsB2C2P1 = new LongStatistics();
statsB1C1P1.setMinMax(Binary.fromString("s"), Binary.fromString("z"));
statsB1C1P2.setMinMax(Binary.fromString("a"), Binary.fromString("b"));
statsB1C2P1.setMinMax(2L, 10L);
statsB1C2P2.setMinMax(-6L, 4L);
statsB2C1P1.setMinMax(Binary.fromString("d"), Binary.fromString("e"));
statsB2C2P1.setMinMax(11L, 122L);
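// write two row groups; each column chunk should end up with the merged statistics of its pages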
ParquetFileWriter w = new ParquetFileWriter(configuration, schema, path);
w.start();
w.startBlock(3);
w.startColumn(c1, 5, codec);
w.writeDataPage(2, 4, BytesInput.from(bytes1), statsB1C1P1, BIT_PACKED, BIT_PACKED, PLAIN);
w.writeDataPage(3, 4, BytesInput.from(bytes1), statsB1C1P2, BIT_PACKED, BIT_PACKED, PLAIN);
w.endColumn();
w.startColumn(c2, 6, codec);
w.writeDataPage(3, 4, BytesInput.from(bytes2), statsB1C2P1, BIT_PACKED, BIT_PACKED, PLAIN);
w.writeDataPage(1, 4, BytesInput.from(bytes2), statsB1C2P2, BIT_PACKED, BIT_PACKED, PLAIN);
w.endColumn();
w.endBlock();
w.startBlock(4);
w.startColumn(c1, 7, codec);
w.writeDataPage(7, 4, BytesInput.from(bytes3), statsB2C1P1, BIT_PACKED, BIT_PACKED, PLAIN);
w.endColumn();
w.startColumn(c2, 8, codec);
w.writeDataPage(8, 4, BytesInput.from(bytes4), statsB2C2P1, BIT_PACKED, BIT_PACKED, PLAIN);
w.endColumn();
w.endBlock();
w.end(new HashMap<String, String>());
ParquetMetadata readFooter = ParquetFileReader.readFooter(configuration, path);
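// walk the footer once, exercising the column path accessors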
for (BlockMetaData block : readFooter.getBlocks()) {
for (ColumnChunkMetaData col : block.getColumns()) {
col.getPath();
}
}
// correct statistics
BinaryStatistics bs1 = new BinaryStatistics();
bs1.setMinMax(Binary.fromString("a"), Binary.fromString("z"));
LongStatistics ls1 = new LongStatistics();
ls1.setMinMax(-6L, 10L);
BinaryStatistics bs2 = new BinaryStatistics();
bs2.setMinMax(Binary.fromString("d"), Binary.fromString("e"));
LongStatistics ls2 = new LongStatistics();
ls2.setMinMax(11L, 122L);
{ // assert stats are correct for the first block
assertEquals(bs1, readFooter.getBlocks().get(0).getColumns().get(0).getStatistics());
assertEquals(ls1, readFooter.getBlocks().get(0).getColumns().get(1).getStatistics());
}
{ // assert stats are correct for the second block
assertEquals(bs2, readFooter.getBlocks().get(1).getColumns().get(0).getStatistics());
assertEquals(ls2, readFooter.getBlocks().get(1).getColumns().get(1).getStatistics());
}
}
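
/**
 * Creates several part files, writes the summary metadata files for the directory,
 * and checks that the footers read from the individual parts, from the summary file,
 * and via the parallel summary-based reader all validate.
 */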
@Test
public void testMetaDataFile() throws Exception {
File testDir = new File("target/test/TestParquetFileWriter/testMetaDataFileDir").getAbsoluteFile();
Path testDirPath = new Path(testDir.toURI());
Configuration configuration = new Configuration();
final FileSystem fs = testDirPath.getFileSystem(configuration);
enforceEmptyDir(configuration, testDirPath);
MessageType schema = MessageTypeParser.parseMessageType("message m { required group a {required binary b;} required group c { required int64 d; }}");
createFile(configuration, new Path(testDirPath, "part0"), schema);
createFile(configuration, new Path(testDirPath, "part1"), schema);
createFile(configuration, new Path(testDirPath, "part2"), schema);
FileStatus outputStatus = fs.getFileStatus(testDirPath);
List<Footer> footers = ParquetFileReader.readFooters(configuration, outputStatus, false);
validateFooters(footers);
ParquetFileWriter.writeMetadataFile(configuration, testDirPath, footers);
footers = ParquetFileReader.readFooters(configuration, outputStatus, false);
validateFooters(footers);
footers = ParquetFileReader.readFooters(configuration, fs.getFileStatus(new Path(testDirPath, "part0")), false);
assertEquals(1, footers.size());
final FileStatus metadataFile = fs.getFileStatus(new Path(testDirPath, ParquetFileWriter.PARQUET_METADATA_FILE));
final FileStatus metadataFileLight = fs.getFileStatus(new Path(testDirPath, ParquetFileWriter.PARQUET_COMMON_METADATA_FILE));
final List<Footer> metadata = ParquetFileReader.readSummaryFile(configuration, metadataFile);
validateFooters(metadata);
footers = ParquetFileReader.readAllFootersInParallelUsingSummaryFiles(configuration, Arrays.asList(fs.listStatus(testDirPath, HiddenFileFilter.INSTANCE)), false);
validateFooters(footers);
fs.delete(metadataFile.getPath(), false);
fs.delete(metadataFileLight.getPath(), false);
footers = ParquetFileReader.readAllFootersInParallelUsingSummaryFiles(configuration, Arrays.asList(fs.listStatus(testDirPath)), false);
validateFooters(footers);
}
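
/**
 * Writes a single record that leaves its only (binary) column unset and verifies that
 * the resulting column statistics are non-empty and report exactly one null.
 */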
@Test
public void testWriteReadStatisticsAllNulls() throws Exception {
File testFile = new File("target/test/TestParquetFileWriter/testParquetFile").getAbsoluteFile();
testFile.delete();
writeSchema = "message example {\n" +
"required binary content;\n" +
"}";
Path path = new Path(testFile.toURI());
MessageType schema = MessageTypeParser.parseMessageType(writeSchema);
Configuration configuration = new Configuration();
GroupWriteSupport.setSchema(schema, configuration);
ParquetWriter<Group> writer = new ParquetWriter<Group>(path, configuration, new GroupWriteSupport());
Group r1 = new SimpleGroup(schema);
writer.write(r1);
writer.close();
ParquetMetadata readFooter = ParquetFileReader.readFooter(configuration, path);
// assert that the statistics object is not empty
assertFalse(readFooter.getBlocks().get(0).getColumns().get(0).getStatistics().isEmpty());
// assert that the number of nulls is correct for the first block
assertEquals(1, readFooter.getBlocks().get(0).getColumns().get(0).getStatistics().getNumNulls());
}
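
/** Asserts that the footers describe the three "part" files written by createFile, including their key/value metadata. */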
private void validateFooters(final List<Footer> metadata) {
LOG.debug(metadata);
assertEquals(String.valueOf(metadata), 3, metadata.size());
for (Footer footer : metadata) {
final File file = new File(footer.getFile().toUri());
assertTrue(file.getName(), file.getName().startsWith("part"));
assertTrue(file.getPath(), file.exists());
final ParquetMetadata parquetMetadata = footer.getParquetMetadata();
assertEquals(2, parquetMetadata.getBlocks().size());
final Map<String, String> keyValueMetaData = parquetMetadata.getFileMetaData().getKeyValueMetaData();
assertEquals("bar", keyValueMetaData.get("foo"));
assertEquals(footer.getFile().getName(), keyValueMetaData.get(footer.getFile().getName()));
}
}
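
/** Writes a small two-row-group file at the given path and tags it with extra key/value metadata. */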
private void createFile(Configuration configuration, Path path, MessageType schema) throws IOException {
String[] path1 = {"a", "b"};
ColumnDescriptor c1 = schema.getColumnDescription(path1);
String[] path2 = {"c", "d"};
ColumnDescriptor c2 = schema.getColumnDescription(path2);
byte[] bytes1 = {0, 1, 2, 3};
byte[] bytes2 = {1, 2, 3, 4};
byte[] bytes3 = {2, 3, 4, 5};
byte[] bytes4 = {3, 4, 5, 6};
CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;
BinaryStatistics stats1 = new BinaryStatistics();
BinaryStatistics stats2 = new BinaryStatistics();
ParquetFileWriter w = new ParquetFileWriter(configuration, schema, path);
w.start();
w.startBlock(3);
w.startColumn(c1, 5, codec);
w.writeDataPage(2, 4, BytesInput.from(bytes1), stats1, BIT_PACKED, BIT_PACKED, PLAIN);
w.writeDataPage(3, 4, BytesInput.from(bytes1), stats1, BIT_PACKED, BIT_PACKED, PLAIN);
w.endColumn();
w.startColumn(c2, 6, codec);
w.writeDataPage(2, 4, BytesInput.from(bytes2), stats2, BIT_PACKED, BIT_PACKED, PLAIN);
w.writeDataPage(3, 4, BytesInput.from(bytes2), stats2, BIT_PACKED, BIT_PACKED, PLAIN);
w.writeDataPage(1, 4, BytesInput.from(bytes2), stats2, BIT_PACKED, BIT_PACKED, PLAIN);
w.endColumn();
w.endBlock();
w.startBlock(4);
w.startColumn(c1, 7, codec);
w.writeDataPage(7, 4, BytesInput.from(bytes3), stats1, BIT_PACKED, BIT_PACKED, PLAIN);
w.endColumn();
w.startColumn(c2, 8, codec);
w.writeDataPage(8, 4, BytesInput.from(bytes4), stats2, BIT_PACKED, BIT_PACKED, PLAIN);
w.endColumn();
w.endBlock();
final HashMap<String, String> extraMetaData = new HashMap<String, String>();
extraMetaData.put("foo", "bar");
extraMetaData.put(path.getName(), path.getName());
w.end(extraMetaData);
}
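
/** Reads the next page of the given column and checks its value count and raw bytes. */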
private void validateContains(MessageType schema, PageReadStore pages, String[] path, int values, BytesInput bytes) throws IOException {
PageReader pageReader = pages.getPageReader(schema.getColumnDescription(path));
DataPage page = pageReader.readPage();
assertEquals(values, page.getValueCount());
assertArrayEquals(bytes.toByteArray(), ((DataPageV1) page).getBytes().toByteArray());
}
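
/**
 * Merges two FileMetaData instances with different schemas and checks that the
 * resulting global schema is the union of both.
 */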
@Test
public void testMergeMetadata() {
FileMetaData md1 = new FileMetaData(
new MessageType("root1",
new PrimitiveType(REPEATED, BINARY, "a"),
new PrimitiveType(OPTIONAL, BINARY, "b")),
new HashMap<String, String>(), "test");
FileMetaData md2 = new FileMetaData(
new MessageType("root2",
new PrimitiveType(REQUIRED, BINARY, "c")),
new HashMap<String, String>(), "test2");
GlobalMetaData merged = ParquetFileWriter.mergeInto(md2, ParquetFileWriter.mergeInto(md1, null));
assertEquals(
new MessageType("root1",
new PrimitiveType(REPEATED, BINARY, "a"),
new PrimitiveType(OPTIONAL, BINARY, "b"),
new PrimitiveType(REQUIRED, BINARY, "c")),
merged.getSchema());
}
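
/**
 * Merges two footers with distinct schemas and block lists and checks that the merged
 * metadata contains the union schema and all blocks in order.
 */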
@Test
public void testMergeFooters() {
List<BlockMetaData> oneBlocks = new ArrayList<BlockMetaData>();
oneBlocks.add(new BlockMetaData());
oneBlocks.add(new BlockMetaData());
List<BlockMetaData> twoBlocks = new ArrayList<BlockMetaData>();
twoBlocks.add(new BlockMetaData());
List<BlockMetaData> expected = new ArrayList<BlockMetaData>();
expected.addAll(oneBlocks);
expected.addAll(twoBlocks);
Footer one = new Footer(new Path("file:/tmp/output/one.parquet"),
new ParquetMetadata(new FileMetaData(
new MessageType("root1",
new PrimitiveType(REPEATED, BINARY, "a"),
new PrimitiveType(OPTIONAL, BINARY, "b")),
new HashMap<String, String>(), "test"),
oneBlocks));
Footer two = new Footer(new Path("/tmp/output/two.parquet"),
new ParquetMetadata(new FileMetaData(
new MessageType("root2",
new PrimitiveType(REQUIRED, BINARY, "c")),
new HashMap<String, String>(), "test2"),
twoBlocks));
List<Footer> footers = new ArrayList<Footer>();
footers.add(one);
footers.add(two);
ParquetMetadata merged = ParquetFileWriter.mergeFooters(
new Path("/tmp"), footers);
assertEquals(
new MessageType("root1",
new PrimitiveType(REPEATED, BINARY, "a"),
new PrimitiveType(OPTIONAL, BINARY, "b"),
new PrimitiveType(REQUIRED, BINARY, "c")),
merged.getFileMetaData().getSchema());
assertEquals("Should have all blocks", expected, merged.getBlocks());
}
}