/**
* Copyright 2011-2017 Asakusa Framework Team.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.asakusafw.directio.hive.parquet;
import java.io.IOException;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.text.MessageFormat;
import java.util.Arrays;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import com.asakusafw.directio.hive.serde.DataModelDescriptor;
import com.asakusafw.runtime.directio.Counter;
import com.asakusafw.runtime.io.ModelOutput;
import parquet.column.ParquetProperties;
import parquet.column.ParquetProperties.WriterVersion;
import parquet.hadoop.ParquetWriter;
import parquet.hadoop.api.WriteSupport;
import parquet.hadoop.metadata.CompressionCodecName;
/**
* An implementation of {@link ModelOutput} for writing Parquet files.
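 * <p>
 * A minimal usage sketch; the data model class {@code Example}, its descriptor, the counter,
 * and the target path below are hypothetical placeholders rather than part of this API:
 * </p>
 * <pre>{@code
 * DataModelDescriptor descriptor = ...;   // descriptor of the generated Example model
 * Counter counter = ...;                  // progress counter provided by the caller
 * ParquetFileOutput.Options options = new ParquetFileOutput.Options();
 * try (ParquetFileOutput<Example> output = new ParquetFileOutput<>(
 *         descriptor, new Configuration(), new Path("/path/to/output.parquet"),
 *         options, counter)) {
 *     output.write(new Example());
 * }
 * }</pre>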
* @param <T> the data model type
* @since 0.7.0
* @version 0.7.2
*/
public class ParquetFileOutput<T> implements ModelOutput<T> {
static final Log LOG = LogFactory.getLog(ParquetFileOutput.class);
static final ParquetVersion LIBRARY_VERSION = ParquetVersion.getSupportedVersion();
private final DataModelDescriptor descriptor;
private final Configuration configuration;
private final DataModelWriteSupport writeSupport;
private final Path path;
private final ParquetFileOutput.Options options;
private final Counter counter;
private ParquetWriter<T> currentWriter;
/**
* Creates a new instance.
* @param descriptor the target data model descriptor
     * @param configuration the Hadoop configuration
     * @param path the path to the target file
     * @param options the Parquet file output options
* @param counter the current counter
*/
public ParquetFileOutput(
DataModelDescriptor descriptor,
Configuration configuration,
Path path,
ParquetFileOutput.Options options,
Counter counter) {
this.writeSupport = new DataModelWriteSupport(descriptor);
this.descriptor = descriptor;
this.configuration = configuration;
this.path = path;
this.options = options;
this.counter = counter;
}
@Override
public void write(T model) throws IOException {
ParquetWriter<T> writer = prepareWriter();
writer.write(model);
        // approximate progress: count one per written record
        counter.add(1);
}
@SuppressWarnings("unchecked")
private ParquetWriter<T> prepareWriter() throws IOException {
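        // create the underlying ParquetWriter lazily, on the first call to write()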
ParquetWriter<T> writer = currentWriter;
if (writer == null) {
if (LOG.isInfoEnabled()) {
LOG.info(MessageFormat.format(
Messages.getString("ParquetFileOutput.infoCreate"), //$NON-NLS-1$
descriptor.getDataModelClass().getSimpleName(),
path));
}
Options opts = options;
writer = LIBRARY_VERSION.newInstance(
path,
(WriteSupport<T>) writeSupport,
opts,
configuration);
currentWriter = writer;
}
return writer;
}
@Override
public void close() throws IOException {
if (currentWriter != null) {
currentWriter.close();
}
}
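    /**
     * Represents the {@code ParquetWriter} constructor signatures of the known Parquet
     * library versions, detected reflectively when this class is loaded
     * (newer signatures are tried first; see {@link #getSupportedVersion()}).
     */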
// CHECKSTYLE:OFF RedundantModifierCheck
private static enum ParquetVersion {
// CHECKSTYLE:ON RedundantModifierCheck
        // Note: keep newer versions at the top (detection returns the first signature that resolves)
V_15(10) {
@Override
<T> ParquetWriter<T> newInstance(
Path path,
WriteSupport<T> writeSupport,
ParquetFileOutput.Options options,
Configuration configuration) throws IOException {
return new ParquetWriter<>(
path,
writeSupport,
options.getCompressionCodecName(),
options.getBlockSize(),
options.getDataPageSize(),
options.getDictionaryPageSize(),
options.isEnableDictionary(),
options.isEnableValidation(),
options.getWriterVersion(),
configuration);
}
},
V_13(9) {
@Override
<T> ParquetWriter<T> newInstance(
Path path,
WriteSupport<T> writeSupport,
ParquetFileOutput.Options options,
Configuration configuration) throws IOException {
return new ParquetWriter<>(
path,
writeSupport,
options.getCompressionCodecName(),
options.getBlockSize(),
options.getDataPageSize(),
options.getDictionaryPageSize(),
options.isEnableDictionary(),
options.isEnableValidation(),
options.getWriterVersion());
}
},
V_12(8) {
@Override
<T> ParquetWriter<T> newInstance(
Path path,
WriteSupport<T> writeSupport,
ParquetFileOutput.Options options,
Configuration configuration) throws IOException {
return new ParquetWriter<>(
path,
writeSupport,
options.getCompressionCodecName(),
options.getBlockSize(),
options.getDataPageSize(),
options.getDictionaryPageSize(),
options.isEnableDictionary(),
options.isEnableValidation());
}
},
UNKNOWN(2) {
@Override
<T> ParquetWriter<T> newInstance(
Path path,
WriteSupport<T> writeSupport,
ParquetFileOutput.Options options,
Configuration configuration) throws IOException {
return new ParquetWriter<>(path, writeSupport);
}
},
;
private static final Class<?>[] PARAMETER_TYPES = {
Path.class, //
WriteSupport.class, // 2: minimum
CompressionCodecName.class, //
int.class, //
int.class, //
int.class, //
boolean.class, //
boolean.class, // 8: v1.2
ParquetProperties.WriterVersion.class, // 9: v1.3
            Configuration.class, // 10: v1.5
};
final int parameterCount;
private transient volatile Constructor<?> constructor;
ParquetVersion(int parameterCount) {
this.parameterCount = parameterCount;
}
private Constructor<?> resolve() {
if (constructor != null) {
return constructor;
}
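            // the first 'parameterCount' entries of PARAMETER_TYPES form this version's signature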
Class<?>[] params = Arrays.copyOfRange(PARAMETER_TYPES, 0, parameterCount);
try {
constructor = ParquetWriter.class.getConstructor(params);
} catch (Exception e) {
LOG.trace(MessageFormat.format(
"Mismatch Parquet library version: {0} {1}", //$NON-NLS-1$
name(),
Arrays.toString(params)), e);
return null;
}
return constructor;
}
static ParquetVersion getSupportedVersion() {
for (ParquetVersion version : values()) {
if (version.resolve() != null) {
if (LOG.isDebugEnabled()) {
LOG.debug(MessageFormat.format(
"Detected Parquet library version: {0}", //$NON-NLS-1$
version));
}
return version;
}
}
            throw new IllegalStateException("cannot detect a supported Parquet library version");
}
@SuppressWarnings("unchecked")
<T> ParquetWriter<T> newInstance(
Path path,
WriteSupport<T> writeSupport,
ParquetFileOutput.Options options,
Configuration configuration) throws IOException {
Object[] argumentsCandidate = {
path,
writeSupport,
options.getCompressionCodecName(),
options.getBlockSize(),
options.getDataPageSize(),
options.getDictionaryPageSize(),
options.isEnableDictionary(),
options.isEnableValidation(),
options.getWriterVersion(),
configuration,
};
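            // truncate the argument list to match the arity of the detected constructor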
Object[] arguments = Arrays.copyOfRange(argumentsCandidate, 0, parameterCount);
try {
return (ParquetWriter<T>) resolve().newInstance(arguments);
} catch (InvocationTargetException e) {
Throwable cause = e.getCause();
if (cause instanceof IOException) {
throw (IOException) cause;
} else if (cause instanceof RuntimeException) {
throw (RuntimeException) cause;
} else if (cause instanceof Error) {
throw (Error) cause;
} else {
throw new IllegalStateException(e);
}
} catch (Exception e) {
throw new IllegalArgumentException(e);
}
}
}
/**
     * The Parquet file writing options.
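     * <p>
     * A configuration sketch (the codec and flag below are illustrative choices only):
     * </p>
     * <pre>{@code
     * ParquetFileOutput.Options options = new ParquetFileOutput.Options();
     * options.setCompressionCodecName(CompressionCodecName.GZIP);
     * options.setEnableValidation(true);
     * }</pre>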
* @since 0.7.0
*/
public static final class Options {
private CompressionCodecName compressionCodecName = CompressionCodecName.SNAPPY;
private int blockSize = ParquetWriter.DEFAULT_BLOCK_SIZE;
private int dataPageSize = ParquetWriter.DEFAULT_PAGE_SIZE;
private int dictionaryPageSize = ParquetWriter.DEFAULT_PAGE_SIZE;
private boolean enableDictionary = true;
private boolean enableValidation = false;
private WriterVersion writerVersion = WriterVersion.PARQUET_1_0;
/**
* Returns the compression codec name.
* @return the compression codec name
*/
public CompressionCodecName getCompressionCodecName() {
return compressionCodecName;
}
/**
* Sets the compression codec name.
     * @param value the compression codec name
*/
public void setCompressionCodecName(CompressionCodecName value) {
this.compressionCodecName = value;
}
/**
* Returns the block size (in bytes).
* @return the block size
*/
public int getBlockSize() {
return blockSize;
}
/**
* Sets the block size (in bytes).
     * @param value the block size in bytes
*/
public void setBlockSize(int value) {
this.blockSize = value;
}
/**
* Returns the data page size (in bytes).
* @return the data page size
*/
public int getDataPageSize() {
return dataPageSize;
}
/**
* Sets the data page size (in bytes).
     * @param value the data page size in bytes
*/
public void setDataPageSize(int value) {
this.dataPageSize = value;
}
/**
* Returns the dictionary page size (in bytes).
* @return the dictionary page size
*/
public int getDictionaryPageSize() {
return dictionaryPageSize;
}
/**
* Sets the dictionary page size (in bytes).
     * @param value the dictionary page size in bytes
*/
public void setDictionaryPageSize(int value) {
this.dictionaryPageSize = value;
}
/**
     * Returns whether dictionary encoding is enabled.
* @return {@code true} if enabled, otherwise {@code false}
*/
public boolean isEnableDictionary() {
return enableDictionary;
}
/**
     * Sets whether dictionary encoding is enabled.
     * @param value {@code true} to enable dictionary encoding, otherwise {@code false}
*/
public void setEnableDictionary(boolean value) {
this.enableDictionary = value;
}
/**
     * Returns whether schema validation is enabled.
* @return {@code true} if enabled, otherwise {@code false}
*/
public boolean isEnableValidation() {
return enableValidation;
}
/**
     * Sets whether schema validation is enabled.
     * @param value {@code true} to enable schema validation, otherwise {@code false}
*/
public void setEnableValidation(boolean value) {
this.enableValidation = value;
}
/**
* Returns the writer version.
* @return the writer version
*/
public WriterVersion getWriterVersion() {
return writerVersion;
}
/**
* Sets the writer version.
     * @param value the writer version
*/
public void setWriterVersion(WriterVersion value) {
this.writerVersion = value;
}
}
}