/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package parquet.hadoop;

import java.io.Closeable;
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

import parquet.column.ParquetProperties;
import parquet.column.ParquetProperties.WriterVersion;
import parquet.hadoop.api.WriteSupport;
import parquet.hadoop.metadata.CompressionCodecName;
import parquet.schema.MessageType;

/**
 * Write records to a Parquet file.
 */
public class ParquetWriter<T> implements Closeable {

  public static final int DEFAULT_BLOCK_SIZE = 128 * 1024 * 1024;
  public static final int DEFAULT_PAGE_SIZE = 1 * 1024 * 1024;
  public static final CompressionCodecName DEFAULT_COMPRESSION_CODEC_NAME =
      CompressionCodecName.UNCOMPRESSED;
  public static final boolean DEFAULT_IS_DICTIONARY_ENABLED = true;
  public static final boolean DEFAULT_IS_VALIDATING_ENABLED = false;
  public static final WriterVersion DEFAULT_WRITER_VERSION = WriterVersion.PARQUET_1_0;

  private final InternalParquetRecordWriter<T> writer;

  /**
   * Create a new ParquetWriter
   * (with dictionary encoding enabled and validation off).
   *
   * @param file the file to create
   * @param writeSupport the implementation to write a record to a RecordConsumer
   * @param compressionCodecName the compression codec to use
   * @param blockSize the block size threshold
   * @param pageSize the page size threshold
   * @throws IOException
   * @see #ParquetWriter(Path, WriteSupport, CompressionCodecName, int, int, boolean, boolean)
   */
  public ParquetWriter(Path file, WriteSupport<T> writeSupport,
      CompressionCodecName compressionCodecName, int blockSize, int pageSize)
      throws IOException {
    this(file, writeSupport, compressionCodecName, blockSize, pageSize,
        DEFAULT_IS_DICTIONARY_ENABLED, DEFAULT_IS_VALIDATING_ENABLED);
  }

  /**
   * Create a new ParquetWriter.
   *
   * @param file the file to create
   * @param writeSupport the implementation to write a record to a RecordConsumer
   * @param compressionCodecName the compression codec to use
   * @param blockSize the block size threshold
   * @param pageSize the page size threshold (both data and dictionary)
   * @param enableDictionary to turn dictionary encoding on
   * @param validating to turn on validation using the schema
   * @throws IOException
   * @see #ParquetWriter(Path, WriteSupport, CompressionCodecName, int, int, int, boolean, boolean)
   */
  public ParquetWriter(
      Path file,
      WriteSupport<T> writeSupport,
      CompressionCodecName compressionCodecName,
      int blockSize,
      int pageSize,
      boolean enableDictionary,
      boolean validating) throws IOException {
    this(file, writeSupport, compressionCodecName, blockSize, pageSize,
        pageSize, enableDictionary, validating);
  }
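  // The overloads above reuse the data page size as the dictionary page size
  // and fall back to the default dictionary/validation flags. A sketch of the
  // expansion the delegating constructors perform (argument names are
  // illustrative):
  //
  //   new ParquetWriter<T>(file, writeSupport, codec, blockSize, pageSize);
  //   // ...is equivalent to:
  //   new ParquetWriter<T>(file, writeSupport, codec, blockSize, pageSize,
  //       pageSize, DEFAULT_IS_DICTIONARY_ENABLED, DEFAULT_IS_VALIDATING_ENABLED);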
  /**
   * Create a new ParquetWriter.
   *
   * @param file the file to create
   * @param writeSupport the implementation to write a record to a RecordConsumer
   * @param compressionCodecName the compression codec to use
   * @param blockSize the block size threshold
   * @param pageSize the page size threshold
   * @param dictionaryPageSize the page size threshold for the dictionary pages
   * @param enableDictionary to turn dictionary encoding on
   * @param validating to turn on validation using the schema
   * @throws IOException
   * @see #ParquetWriter(Path, WriteSupport, CompressionCodecName, int, int, int, boolean, boolean, WriterVersion)
   */
  public ParquetWriter(
      Path file,
      WriteSupport<T> writeSupport,
      CompressionCodecName compressionCodecName,
      int blockSize,
      int pageSize,
      int dictionaryPageSize,
      boolean enableDictionary,
      boolean validating) throws IOException {
    this(file, writeSupport, compressionCodecName, blockSize, pageSize,
        dictionaryPageSize, enableDictionary, validating,
        DEFAULT_WRITER_VERSION);
  }

  /**
   * Create a new ParquetWriter.
   * <p>
   * Directly instantiates a Hadoop {@link org.apache.hadoop.conf.Configuration} which reads
   * configuration from the classpath.
   *
   * @param file the file to create
   * @param writeSupport the implementation to write a record to a RecordConsumer
   * @param compressionCodecName the compression codec to use
   * @param blockSize the block size threshold
   * @param pageSize the page size threshold
   * @param dictionaryPageSize the page size threshold for the dictionary pages
   * @param enableDictionary to turn dictionary encoding on
   * @param validating to turn on validation using the schema
   * @param writerVersion version of the Parquet format to write, from {@link ParquetProperties.WriterVersion}
   * @throws IOException
   * @see #ParquetWriter(Path, WriteSupport, CompressionCodecName, int, int, int, boolean, boolean, WriterVersion, Configuration)
   */
  public ParquetWriter(
      Path file,
      WriteSupport<T> writeSupport,
      CompressionCodecName compressionCodecName,
      int blockSize,
      int pageSize,
      int dictionaryPageSize,
      boolean enableDictionary,
      boolean validating,
      WriterVersion writerVersion) throws IOException {
    this(file, writeSupport, compressionCodecName, blockSize, pageSize,
        dictionaryPageSize, enableDictionary, validating, writerVersion,
        new Configuration());
  }

  /**
   * Create a new ParquetWriter.
   *
   * @param file the file to create
   * @param writeSupport the implementation to write a record to a RecordConsumer
   * @param compressionCodecName the compression codec to use
   * @param blockSize the block size threshold
   * @param pageSize the page size threshold
   * @param dictionaryPageSize the page size threshold for the dictionary pages
   * @param enableDictionary to turn dictionary encoding on
   * @param validating to turn on validation using the schema
   * @param writerVersion version of the Parquet format to write, from {@link ParquetProperties.WriterVersion}
   * @param conf Hadoop configuration to use while accessing the filesystem
   * @throws IOException
   */
  public ParquetWriter(
      Path file,
      WriteSupport<T> writeSupport,
      CompressionCodecName compressionCodecName,
      int blockSize,
      int pageSize,
      int dictionaryPageSize,
      boolean enableDictionary,
      boolean validating,
      WriterVersion writerVersion,
      Configuration conf) throws IOException {
    this(file, ParquetFileWriter.Mode.CREATE, writeSupport, compressionCodecName,
        blockSize, pageSize, dictionaryPageSize, enableDictionary, validating,
        writerVersion, conf);
  }
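  // A sketch of opting into a non-default format version via the WriterVersion
  // overload above (assumes WriterVersion.PARQUET_2_0 is declared by
  // ParquetProperties.WriterVersion; the codec and sizes are illustrative):
  //
  //   new ParquetWriter<T>(file, writeSupport, CompressionCodecName.SNAPPY,
  //       DEFAULT_BLOCK_SIZE, DEFAULT_PAGE_SIZE, DEFAULT_PAGE_SIZE,
  //       true, false, WriterVersion.PARQUET_2_0);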
  /**
   * Create a new ParquetWriter.
   *
   * @param file the file to create
   * @param mode file creation mode
   * @param writeSupport the implementation to write a record to a RecordConsumer
   * @param compressionCodecName the compression codec to use
   * @param blockSize the block size threshold
   * @param pageSize the page size threshold
   * @param dictionaryPageSize the page size threshold for the dictionary pages
   * @param enableDictionary to turn dictionary encoding on
   * @param validating to turn on validation using the schema
   * @param writerVersion version of the Parquet format to write, from {@link ParquetProperties.WriterVersion}
   * @param conf Hadoop configuration to use while accessing the filesystem
   * @throws IOException
   */
  public ParquetWriter(
      Path file,
      ParquetFileWriter.Mode mode,
      WriteSupport<T> writeSupport,
      CompressionCodecName compressionCodecName,
      int blockSize,
      int pageSize,
      int dictionaryPageSize,
      boolean enableDictionary,
      boolean validating,
      WriterVersion writerVersion,
      Configuration conf) throws IOException {

    // Let the WriteSupport derive the schema and extra metadata from the configuration.
    WriteSupport.WriteContext writeContext = writeSupport.init(conf);
    MessageType schema = writeContext.getSchema();

    // Open the underlying file and write the leading Parquet magic bytes.
    ParquetFileWriter fileWriter = new ParquetFileWriter(conf, schema, file, mode);
    fileWriter.start();

    CodecFactory codecFactory = new CodecFactory(conf);
    CodecFactory.BytesCompressor compressor =
        codecFactory.getCompressor(compressionCodecName, 0);
    this.writer = new InternalParquetRecordWriter<T>(
        fileWriter,
        writeSupport,
        schema,
        writeContext.getExtraMetaData(),
        blockSize,
        pageSize,
        compressor,
        dictionaryPageSize,
        enableDictionary,
        validating,
        writerVersion);
  }

  /**
   * Create a new ParquetWriter. The default block size is 128 MB and the
   * default page size is 1 MB. Compression is off (UNCOMPRESSED), dictionary
   * encoding is on, and validation is off by default.
   *
   * @param file the file to create
   * @param writeSupport the implementation to write a record to a RecordConsumer
   * @throws IOException
   */
  public ParquetWriter(Path file, WriteSupport<T> writeSupport) throws IOException {
    this(file, writeSupport, DEFAULT_COMPRESSION_CODEC_NAME, DEFAULT_BLOCK_SIZE, DEFAULT_PAGE_SIZE);
  }

  /**
   * Create a new ParquetWriter with all defaults, using the given Hadoop
   * configuration while accessing the filesystem.
   *
   * @param file the file to create
   * @param conf Hadoop configuration to use while accessing the filesystem
   * @param writeSupport the implementation to write a record to a RecordConsumer
   * @throws IOException
   */
  public ParquetWriter(Path file, Configuration conf, WriteSupport<T> writeSupport) throws IOException {
    this(file,
        writeSupport,
        DEFAULT_COMPRESSION_CODEC_NAME,
        DEFAULT_BLOCK_SIZE,
        DEFAULT_PAGE_SIZE,
        DEFAULT_PAGE_SIZE,
        DEFAULT_IS_DICTIONARY_ENABLED,
        DEFAULT_IS_VALIDATING_ENABLED,
        DEFAULT_WRITER_VERSION,
        conf);
  }

  public void write(T object) throws IOException {
    try {
      writer.write(object);
    } catch (InterruptedException e) {
      throw new IOException(e);
    }
  }

  @Override
  public void close() throws IOException {
    try {
      writer.close();
    } catch (InterruptedException e) {
      throw new IOException(e);
    }
  }
}
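// A minimal end-to-end usage sketch, not part of this class. It assumes the
// parquet example module (parquet.hadoop.example.GroupWriteSupport,
// parquet.example.data.Group, parquet.example.data.simple.SimpleGroupFactory)
// and parquet.schema.MessageTypeParser; the path and schema are illustrative:
//
//   MessageType schema = MessageTypeParser.parseMessageType(
//       "message point { required int32 x; required int32 y; }");
//   Configuration conf = new Configuration();
//   GroupWriteSupport.setSchema(schema, conf);
//
//   ParquetWriter<Group> writer = new ParquetWriter<Group>(
//       new Path("/tmp/points.parquet"), conf, new GroupWriteSupport());
//   SimpleGroupFactory factory = new SimpleGroupFactory(schema);
//   try {
//     writer.write(factory.newGroup().append("x", 1).append("y", 2));
//   } finally {
//     writer.close();
//   }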