ParquetFileWriter.java example

Explorer
pbase-master
/* 
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package parquet.hadoop;

import static parquet.Log.DEBUG;
import static parquet.format.Util.writeFileMetaData;

import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import parquet.Log;
import parquet.Version;
import parquet.bytes.BytesInput;
import parquet.bytes.BytesUtils;
import parquet.column.ColumnDescriptor;
import parquet.column.page.DictionaryPage;
import parquet.column.statistics.Statistics;
import parquet.hadoop.metadata.ColumnPath;
import parquet.format.converter.ParquetMetadataConverter;
import parquet.hadoop.metadata.BlockMetaData;
import parquet.hadoop.metadata.ColumnChunkMetaData;
import parquet.hadoop.metadata.CompressionCodecName;
import parquet.hadoop.metadata.FileMetaData;
import parquet.hadoop.metadata.GlobalMetaData;
import parquet.hadoop.metadata.ParquetMetadata;
import parquet.io.ParquetEncodingException;
import parquet.schema.MessageType;
import parquet.schema.PrimitiveType.PrimitiveTypeName;

/**
 * Internal implementation of the Parquet file writer as a block container
 *
 * @author Julien Le Dem
 */
public class ParquetFileWriter {
    private static final Log LOG = Log.getLog(ParquetFileWriter.class);

    public static final String PARQUET_METADATA_FILE = "_metadata";
    public static final String PARQUET_COMMON_METADATA_FILE = "_common_metadata";
    public static final byte[] MAGIC = "PAR1".getBytes(Charset.forName("ASCII"));
    public static final int CURRENT_VERSION = 1;

    // File creation modes
    public static enum Mode {
        CREATE,
        OVERWRITE
    }

    private static final ParquetMetadataConverter metadataConverter = new ParquetMetadataConverter();

    private final MessageType schema;
    private final FSDataOutputStream out;
    private BlockMetaData currentBlock;
    private ColumnChunkMetaData currentColumn;
    private long currentRecordCount;
    private List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
    private long uncompressedLength;
    private long compressedLength;
    private Set<parquet.column.Encoding> currentEncodings;

    private CompressionCodecName currentChunkCodec;
    private ColumnPath currentChunkPath;
    private PrimitiveTypeName currentChunkType;
    private long currentChunkFirstDataPage;
    private long currentChunkDictionaryPageOffset;
    private long currentChunkValueCount;

    private Statistics currentStatistics;

    /**
     * Captures the order in which methods should be called
     *
     * @author Julien Le Dem
     */
    private enum STATE {
        NOT_STARTED {
            STATE start() {
                return STARTED;
            }
        },
        STARTED {
            STATE startBlock() {
                return BLOCK;
            }

            STATE end() {
                return ENDED;
            }
        },
        BLOCK {
            STATE startColumn() {
                return COLUMN;
            }

            STATE endBlock() {
                return STARTED;
            }
        },
        COLUMN {
            STATE endColumn() {
                return BLOCK;
            }

            ;

            STATE write() {
                return this;
            }
        },
        ENDED;

        STATE start() throws IOException {
            return error();
        }

        STATE startBlock() throws IOException {
            return error();
        }

        STATE startColumn() throws IOException {
            return error();
        }

        STATE write() throws IOException {
            return error();
        }

        STATE endColumn() throws IOException {
            return error();
        }

        STATE endBlock() throws IOException {
            return error();
        }

        STATE end() throws IOException {
            return error();
        }

        private final STATE error() throws IOException {
            throw new IOException("The file being written is in an invalid state. Probably caused by an error thrown previously. Current state: " + this.name());
        }
    }

    private STATE state = STATE.NOT_STARTED;

    /**
     * @param configuration Hadoop configuration
     * @param schema        the schema of the data
     * @param file          the file to write to
     * @throws IOException if the file can not be created
     */
    public ParquetFileWriter(Configuration configuration, MessageType schema,
                             Path file) throws IOException {
        this(configuration, schema, file, Mode.CREATE);
    }

    /**
     * @param configuration Hadoop configuration
     * @param schema        the schema of the data
     * @param file          the file to write to
     * @param mode          file creation mode
     * @throws IOException if the file can not be created
     */
    public ParquetFileWriter(Configuration configuration, MessageType schema,
                             Path file, Mode mode) throws IOException {
        super();
        this.schema = schema;
        FileSystem fs = file.getFileSystem(configuration);
        boolean overwriteFlag = (mode == Mode.OVERWRITE);

        this.out = fs.create(file, overwriteFlag);
    }

    /**
     * start the file
     *
     * @throws IOException
     */
    public void start() throws IOException {
        state = state.start();
        if (DEBUG) LOG.debug(out.getPos() + ": start");
        out.write(MAGIC);
    }

    /**
     * start a block
     *
     * @param recordCount the record count in this block
     * @throws IOException
     */
    public void startBlock(long recordCount) throws IOException {
        state = state.startBlock();
        if (DEBUG) LOG.debug(out.getPos() + ": start block");
//    out.write(MAGIC); // TODO: add a magic delimiter
        currentBlock = new BlockMetaData();
        currentRecordCount = recordCount;
    }

    /**
     * start a column inside a block
     *
     * @param descriptor           the column descriptor
     * @param valueCount           the value count in this column
     * @param statistics           the statistics in this column
     * @param compressionCodecName
     * @throws IOException
     */
    public void startColumn(ColumnDescriptor descriptor,
                            long valueCount,
                            CompressionCodecName compressionCodecName) throws IOException {
        state = state.startColumn();
        if (DEBUG) LOG.debug(out.getPos() + ": start column: " + descriptor + " count=" + valueCount);
        currentEncodings = new HashSet<parquet.column.Encoding>();
        currentChunkPath = ColumnPath.get(descriptor.getPath());
        currentChunkType = descriptor.getType();
        currentChunkCodec = compressionCodecName;
        currentChunkValueCount = valueCount;
        currentChunkFirstDataPage = out.getPos();
        compressedLength = 0;
        uncompressedLength = 0;
        // need to know what type of stats to initialize to
        // better way to do this?
        currentStatistics = Statistics.getStatsBasedOnType(currentChunkType);
    }

    /**
     * writes a dictionary page page
     *
     * @param dictionaryPage the dictionary page
     */
    public void writeDictionaryPage(DictionaryPage dictionaryPage) throws IOException {
        state = state.write();
        if (DEBUG)
            LOG.debug(out.getPos() + ": write dictionary page: " + dictionaryPage.getDictionarySize() + " values");
        currentChunkDictionaryPageOffset = out.getPos();
        int uncompressedSize = dictionaryPage.getUncompressedSize();
        int compressedPageSize = (int) dictionaryPage.getBytes().size(); // TODO: fix casts
        metadataConverter.writeDictionaryPageHeader(
                uncompressedSize,
                compressedPageSize,
                dictionaryPage.getDictionarySize(),
                dictionaryPage.getEncoding(),
                out);
        long headerSize = out.getPos() - currentChunkDictionaryPageOffset;
        this.uncompressedLength += uncompressedSize + headerSize;
        this.compressedLength += compressedPageSize + headerSize;
        if (DEBUG) LOG.debug(out.getPos() + ": write dictionary page content " + compressedPageSize);
        dictionaryPage.getBytes().writeAllTo(out);
        currentEncodings.add(dictionaryPage.getEncoding());
    }


    /**
     * writes a single page
     *
     * @param valueCount           count of values
     * @param uncompressedPageSize the size of the data once uncompressed
     * @param bytes                the compressed data for the page without header
     * @param rlEncoding           encoding of the repetition level
     * @param dlEncoding           encoding of the definition level
     * @param valuesEncoding       encoding of values
     */
    @Deprecated
    public void writeDataPage(
            int valueCount, int uncompressedPageSize,
            BytesInput bytes,
            parquet.column.Encoding rlEncoding,
            parquet.column.Encoding dlEncoding,
            parquet.column.Encoding valuesEncoding) throws IOException {
        state = state.write();
        long beforeHeader = out.getPos();
        if (DEBUG) LOG.debug(beforeHeader + ": write data page: " + valueCount + " values");
        int compressedPageSize = (int) bytes.size();
        metadataConverter.writeDataPageHeader(
                uncompressedPageSize, compressedPageSize,
                valueCount,
                rlEncoding,
                dlEncoding,
                valuesEncoding,
                out);
        long headerSize = out.getPos() - beforeHeader;
        this.uncompressedLength += uncompressedPageSize + headerSize;
        this.compressedLength += compressedPageSize + headerSize;
        if (DEBUG) LOG.debug(out.getPos() + ": write data page content " + compressedPageSize);
        bytes.writeAllTo(out);
        currentEncodings.add(rlEncoding);
        currentEncodings.add(dlEncoding);
        currentEncodings.add(valuesEncoding);
    }

    /**
     * writes a single page
     *
     * @param valueCount           count of values
     * @param uncompressedPageSize the size of the data once uncompressed
     * @param bytes                the compressed data for the page without header
     * @param rlEncoding           encoding of the repetition level
     * @param dlEncoding           encoding of the definition level
     * @param valuesEncoding       encoding of values
     */
    public void writeDataPage(
            int valueCount, int uncompressedPageSize,
            BytesInput bytes,
            Statistics statistics,
            parquet.column.Encoding rlEncoding,
            parquet.column.Encoding dlEncoding,
            parquet.column.Encoding valuesEncoding) throws IOException {
        state = state.write();
        long beforeHeader = out.getPos();
        if (DEBUG) LOG.debug(beforeHeader + ": write data page: " + valueCount + " values");
        int compressedPageSize = (int) bytes.size();
        metadataConverter.writeDataPageHeader(
                uncompressedPageSize, compressedPageSize,
                valueCount,
                statistics,
                rlEncoding,
                dlEncoding,
                valuesEncoding,
                out);
        long headerSize = out.getPos() - beforeHeader;
        this.uncompressedLength += uncompressedPageSize + headerSize;
        this.compressedLength += compressedPageSize + headerSize;
        if (DEBUG) LOG.debug(out.getPos() + ": write data page content " + compressedPageSize);
        bytes.writeAllTo(out);
        currentStatistics.mergeStatistics(statistics);
        currentEncodings.add(rlEncoding);
        currentEncodings.add(dlEncoding);
        currentEncodings.add(valuesEncoding);
    }

    /**
     * writes a number of pages at once
     *
     * @param bytes                     bytes to be written including page headers
     * @param uncompressedTotalPageSize total uncompressed size (without page headers)
     * @param compressedTotalPageSize   total compressed size (without page headers)
     * @throws IOException
     */
    void writeDataPages(BytesInput bytes,
                        long uncompressedTotalPageSize,
                        long compressedTotalPageSize,
                        Statistics totalStats,
                        List<parquet.column.Encoding> encodings) throws IOException {
        state = state.write();
        if (DEBUG) LOG.debug(out.getPos() + ": write data pages");
        long headersSize = bytes.size() - compressedTotalPageSize;
        this.uncompressedLength += uncompressedTotalPageSize + headersSize;
        this.compressedLength += compressedTotalPageSize + headersSize;
        if (DEBUG) LOG.debug(out.getPos() + ": write data pages content");
        bytes.writeAllTo(out);
        currentEncodings.addAll(encodings);
        currentStatistics = totalStats;
    }

    /**
     * end a column (once all rep, def and data have been written)
     *
     * @throws IOException
     */
    public void endColumn() throws IOException {
        state = state.endColumn();
        if (DEBUG) LOG.debug(out.getPos() + ": end column");
        currentBlock.addColumn(ColumnChunkMetaData.get(
                currentChunkPath,
                currentChunkType,
                currentChunkCodec,
                currentEncodings,
                currentStatistics,
                currentChunkFirstDataPage,
                currentChunkDictionaryPageOffset,
                currentChunkValueCount,
                compressedLength,
                uncompressedLength));
        if (DEBUG) LOG.info("ended Column chumk: " + currentColumn);
        currentColumn = null;
        this.currentBlock.setTotalByteSize(currentBlock.getTotalByteSize() + uncompressedLength);
        this.uncompressedLength = 0;
        this.compressedLength = 0;
    }

    /**
     * ends a block once all column chunks have been written
     *
     * @throws IOException
     */
    public void endBlock() throws IOException {
        state = state.endBlock();
        if (DEBUG) LOG.debug(out.getPos() + ": end block");
        currentBlock.setRowCount(currentRecordCount);
        blocks.add(currentBlock);
        currentBlock = null;
    }

    /**
     * ends a file once all blocks have been written.
     * closes the file.
     *
     * @param extraMetaData the extra meta data to write in the footer
     * @throws IOException
     */
    public void end(Map<String, String> extraMetaData) throws IOException {
        state = state.end();
        if (DEBUG) LOG.debug(out.getPos() + ": end");
        ParquetMetadata footer = new ParquetMetadata(new FileMetaData(schema, extraMetaData, Version.FULL_VERSION), blocks);
        serializeFooter(footer, out);
        out.close();
    }

    private static void serializeFooter(ParquetMetadata footer, FSDataOutputStream out) throws IOException {
        long footerIndex = out.getPos();
        parquet.format.FileMetaData parquetMetadata = new ParquetMetadataConverter().toParquetMetadata(CURRENT_VERSION, footer);
        writeFileMetaData(parquetMetadata, out);
        if (DEBUG) LOG.debug(out.getPos() + ": footer length = " + (out.getPos() - footerIndex));
        BytesUtils.writeIntLittleEndian(out, (int) (out.getPos() - footerIndex));
        out.write(MAGIC);
    }

    /**
     * writes a _metadata and _common_metadata file
     *
     * @param configuration the configuration to use to get the FileSystem
     * @param outputPath    the directory to write the _metadata file to
     * @param footers       the list of footers to merge
     * @throws IOException
     */
    public static void writeMetadataFile(Configuration configuration, Path outputPath, List<Footer> footers) throws IOException {
        ParquetMetadata metadataFooter = mergeFooters(outputPath, footers);
        FileSystem fs = outputPath.getFileSystem(configuration);
        outputPath = outputPath.makeQualified(fs);
        writeMetadataFile(outputPath, metadataFooter, fs, PARQUET_METADATA_FILE);
        metadataFooter.getBlocks().clear();
        writeMetadataFile(outputPath, metadataFooter, fs, PARQUET_COMMON_METADATA_FILE);
    }

    private static void writeMetadataFile(Path outputPath, ParquetMetadata metadataFooter, FileSystem fs, String parquetMetadataFile)
            throws IOException {
        Path metaDataPath = new Path(outputPath, parquetMetadataFile);
        FSDataOutputStream metadata = fs.create(metaDataPath);
        metadata.write(MAGIC);
        serializeFooter(metadataFooter, metadata);
        metadata.close();
    }

    static ParquetMetadata mergeFooters(Path root, List<Footer> footers) {
        String rootPath = root.toUri().getPath();
        GlobalMetaData fileMetaData = null;
        List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
        for (Footer footer : footers) {
            String footerPath = footer.getFile().toUri().getPath();
            if (!footerPath.startsWith(rootPath)) {
                throw new ParquetEncodingException(footerPath + " invalid: all the files must be contained in the root " + root);
            }
            footerPath = footerPath.substring(rootPath.length());
            while (footerPath.startsWith("/")) {
                footerPath = footerPath.substring(1);
            }
            fileMetaData = mergeInto(footer.getParquetMetadata().getFileMetaData(), fileMetaData);
            for (BlockMetaData block : footer.getParquetMetadata().getBlocks()) {
                block.setPath(footerPath);
                blocks.add(block);
            }
        }
        return new ParquetMetadata(fileMetaData.merge(), blocks);
    }

    /**
     * @return the current position in the underlying file
     * @throws IOException
     */
    public long getPos() throws IOException {
        return out.getPos();
    }

    /**
     * Will merge the metadata of all the footers together
     *
     * @param footers the list files footers to merge
     * @return the global meta data for all the footers
     */
    static GlobalMetaData getGlobalMetaData(List<Footer> footers) {
        return getGlobalMetaData(footers, true);
    }

    static GlobalMetaData getGlobalMetaData(List<Footer> footers, boolean strict) {
        GlobalMetaData fileMetaData = null;
        for (Footer footer : footers) {
            ParquetMetadata currentMetadata = footer.getParquetMetadata();
            fileMetaData = mergeInto(currentMetadata.getFileMetaData(), fileMetaData, strict);
        }
        return fileMetaData;
    }

    /**
     * Will return the result of merging toMerge into mergedMetadata
     *
     * @param toMerge        the metadata toMerge
     * @param mergedMetadata the reference metadata to merge into
     * @return the result of the merge
     */
    static GlobalMetaData mergeInto(
            FileMetaData toMerge,
            GlobalMetaData mergedMetadata) {
        return mergeInto(toMerge, mergedMetadata, true);
    }

    static GlobalMetaData mergeInto(
            FileMetaData toMerge,
            GlobalMetaData mergedMetadata,
            boolean strict) {
        MessageType schema = null;
        Map<String, Set<String>> newKeyValues = new HashMap<String, Set<String>>();
        Set<String> createdBy = new HashSet<String>();
        if (mergedMetadata != null) {
            schema = mergedMetadata.getSchema();
            newKeyValues.putAll(mergedMetadata.getKeyValueMetaData());
            createdBy.addAll(mergedMetadata.getCreatedBy());
        }
        if ((schema == null && toMerge.getSchema() != null)
                || (schema != null && !schema.equals(toMerge.getSchema()))) {
            schema = mergeInto(toMerge.getSchema(), schema, strict);
        }
        for (Entry<String, String> entry : toMerge.getKeyValueMetaData().entrySet()) {
            Set<String> values = newKeyValues.get(entry.getKey());
            if (values == null) {
                values = new HashSet<String>();
                newKeyValues.put(entry.getKey(), values);
            }
            values.add(entry.getValue());
        }
        createdBy.add(toMerge.getCreatedBy());
        return new GlobalMetaData(
                schema,
                newKeyValues,
                createdBy);
    }

    /**
     * will return the result of merging toMerge into mergedSchema
     *
     * @param toMerge      the schema to merge into mergedSchema
     * @param mergedSchema the schema to append the fields to
     * @return the resulting schema
     */
    static MessageType mergeInto(MessageType toMerge, MessageType mergedSchema) {
        return mergeInto(toMerge, mergedSchema, true);
    }

    /**
     * will return the result of merging toMerge into mergedSchema
     *
     * @param toMerge      the schema to merge into mergedSchema
     * @param mergedSchema the schema to append the fields to
     * @param strict       should schema primitive types match
     * @return the resulting schema
     */
    static MessageType mergeInto(MessageType toMerge, MessageType mergedSchema, boolean strict) {
        if (mergedSchema == null) {
            return toMerge;
        }

        return mergedSchema.union(toMerge, strict);
    }

}