/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.facebook.presto.hive.parquet;
import com.facebook.presto.hive.HiveColumnHandle;
import com.facebook.presto.spi.PrestoException;
import com.facebook.presto.spi.type.BigintType;
import com.facebook.presto.spi.type.BooleanType;
import com.facebook.presto.spi.type.DecimalType;
import com.facebook.presto.spi.type.DoubleType;
import com.facebook.presto.spi.type.IntegerType;
import com.facebook.presto.spi.type.RealType;
import com.facebook.presto.spi.type.TimestampType;
import com.facebook.presto.spi.type.Type;
import com.facebook.presto.spi.type.VarcharType;
import parquet.column.ColumnDescriptor;
import parquet.column.Encoding;
import parquet.io.ColumnIO;
import parquet.io.ColumnIOFactory;
import parquet.io.InvalidRecordException;
import parquet.io.ParquetDecodingException;
import parquet.io.PrimitiveColumnIO;
import parquet.schema.DecimalMetadata;
import parquet.schema.MessageType;
import java.util.List;
import java.util.Optional;
import static com.facebook.presto.spi.StandardErrorCode.NOT_SUPPORTED;
import static com.google.common.base.Preconditions.checkArgument;
import static java.util.Optional.empty;
import static parquet.schema.OriginalType.DECIMAL;
import static parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;
import static parquet.schema.PrimitiveType.PrimitiveTypeName.BOOLEAN;
import static parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE;
import static parquet.schema.PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY;
import static parquet.schema.PrimitiveType.PrimitiveTypeName.FLOAT;
import static parquet.schema.PrimitiveType.PrimitiveTypeName.INT32;
import static parquet.schema.PrimitiveType.PrimitiveTypeName.INT64;
import static parquet.schema.PrimitiveType.PrimitiveTypeName.INT96;
/**
 * Static helpers for mapping between Parquet schema elements (columns, types,
 * encodings) and their Presto / Hive counterparts.
 */
public final class ParquetTypeUtils
{
    private ParquetTypeUtils()
    {
    }

    /**
     * Returns the leaf (primitive) columns obtained by building the column IO
     * of {@code requestedSchema} against {@code fileSchema}.
     *
     * @param fileSchema schema stored in the Parquet file
     * @param requestedSchema schema requested by the reader
     * @return leaves of the resulting column IO, in the order reported by Parquet
     */
    public static List<PrimitiveColumnIO> getColumns(MessageType fileSchema, MessageType requestedSchema)
    {
        // NOTE(review): the trailing 'true' is presumably the "strict" flag of ColumnIOFactory.getColumnIO - confirm.
        return (new ColumnIOFactory()).getColumnIO(requestedSchema, fileSchema, true).getLeaves();
    }

    /**
     * Looks up the descriptor of the leaf column addressed by {@code path}.
     * Path components are compared case-insensitively.
     *
     * @param fileSchema schema stored in the Parquet file
     * @param requestedSchema schema requested by the reader
     * @param path column path; must contain at least one component
     * @return the descriptor of the matching column, or empty when no column matches
     * @throws IllegalArgumentException if {@code path} is empty
     */
    public static Optional<RichColumnDescriptor> getDescriptor(MessageType fileSchema, MessageType requestedSchema, List<String> path)
    {
        checkArgument(path.size() >= 1, "Parquet nested path should have at least one component");
        // Build the leaf list once and share it between the lookup and the
        // descriptor construction instead of rebuilding the column IO twice.
        List<PrimitiveColumnIO> columns = getColumns(fileSchema, requestedSchema);
        return toDescriptor(columns, getPathIndex(columns, path));
    }

    /**
     * Returns the descriptor of the leaf column at position {@code index} in
     * {@link #getColumns(MessageType, MessageType)}.
     *
     * @param fileSchema schema stored in the Parquet file
     * @param requestedSchema schema requested by the reader
     * @param index position of the leaf column, or {@code -1} for "no column"
     * @return the descriptor, or empty when {@code index} is {@code -1}
     */
    public static Optional<RichColumnDescriptor> getDescriptor(MessageType fileSchema, MessageType requestedSchema, int index)
    {
        // Check the sentinel before touching the schemas so that "no column"
        // never triggers column IO construction.
        if (index == -1) {
            return empty();
        }
        return toDescriptor(getColumns(fileSchema, requestedSchema), index);
    }

    /**
     * Wraps the leaf column at {@code index} into a {@link RichColumnDescriptor};
     * {@code -1} yields an empty result.
     */
    private static Optional<RichColumnDescriptor> toDescriptor(List<PrimitiveColumnIO> columns, int index)
    {
        if (index == -1) {
            return empty();
        }
        PrimitiveColumnIO columnIO = columns.get(index);
        ColumnDescriptor descriptor = columnIO.getColumnDescriptor();
        return Optional.of(new RichColumnDescriptor(
                descriptor.getPath(),
                columnIO.getType().asPrimitiveType(),
                descriptor.getMaxRepetitionLevel(),
                descriptor.getMaxDefinitionLevel()));
    }

    /**
     * Finds the index of the leaf column whose path matches {@code path}.
     * <p>
     * A column matches when its path has more elements than {@code path} and
     * elements {@code 1..path.size()} equal the given components ignoring case,
     * so columns nested deeper than {@code path} also match on their leading
     * components. When several columns match, the last one wins.
     *
     * @return the index of the last matching column, or {@code -1} if none matches
     */
    private static int getPathIndex(List<PrimitiveColumnIO> columns, List<String> path)
    {
        int maxLevel = path.size();
        int index = -1;
        for (int columnIndex = 0; columnIndex < columns.size(); columnIndex++) {
            ColumnIO[] fields = columns.get(columnIndex).getPath();
            if (fields.length > maxLevel && matchesPath(fields, path)) {
                // Deliberately no break: a later match overrides an earlier one.
                index = columnIndex;
            }
        }
        return index;
    }

    /**
     * Compares {@code path} against {@code fields[1..path.size()]} ignoring case,
     * stopping at the first mismatch. The caller guarantees
     * {@code fields.length > path.size()}.
     */
    private static boolean matchesPath(ColumnIO[] fields, List<String> path)
    {
        // Element 0 of the ColumnIO path is skipped (presumably the message root - confirm).
        for (int level = 0; level < path.size(); level++) {
            if (!fields[level + 1].getName().equalsIgnoreCase(path.get(level))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Maps the primitive Parquet type of {@code descriptor} to a Presto type.
     * {@code BINARY}, {@code INT32}, {@code INT64} and {@code FIXED_LEN_BYTE_ARRAY}
     * become a decimal type when annotated as {@code DECIMAL}; otherwise they map
     * to {@code VARCHAR}, {@code INTEGER} and {@code BIGINT} respectively, and
     * {@code FIXED_LEN_BYTE_ARRAY} is rejected.
     *
     * @throws PrestoException with {@code NOT_SUPPORTED} for a non-decimal
     *         {@code FIXED_LEN_BYTE_ARRAY} or any other unhandled primitive type
     */
    public static Type getPrestoType(RichColumnDescriptor descriptor)
    {
        switch (descriptor.getType()) {
            case BOOLEAN:
                return BooleanType.BOOLEAN;
            case BINARY:
                return createDecimalType(descriptor).orElse(VarcharType.VARCHAR);
            case FLOAT:
                return RealType.REAL;
            case DOUBLE:
                return DoubleType.DOUBLE;
            case INT32:
                return createDecimalType(descriptor).orElse(IntegerType.INTEGER);
            case INT64:
                return createDecimalType(descriptor).orElse(BigintType.BIGINT);
            case INT96:
                return TimestampType.TIMESTAMP;
            case FIXED_LEN_BYTE_ARRAY:
                return createDecimalType(descriptor).orElseThrow(() -> new PrestoException(NOT_SUPPORTED, "Parquet type FIXED_LEN_BYTE_ARRAY supported as DECIMAL; got " + descriptor.getPrimitiveType().getOriginalType()));
            default:
                throw new PrestoException(NOT_SUPPORTED, "Unsupported parquet type: " + descriptor.getType());
        }
    }

    /**
     * Returns the index of the top-level field called {@code name}, preferring
     * an exact match and falling back to a case-insensitive one.
     * <p>
     * Unlike {@link MessageType#getFieldIndex(String)}, a missing field is
     * reported as {@code -1} rather than an {@link InvalidRecordException}.
     *
     * @return the field index, or {@code -1} if no field matches
     */
    public static int getFieldIndex(MessageType fileSchema, String name)
    {
        // Test for the exact name up front instead of catching the exception
        // raised by getFieldIndex for the (common) miss.
        if (fileSchema.containsField(name)) {
            return fileSchema.getFieldIndex(name);
        }
        for (parquet.schema.Type type : fileSchema.getFields()) {
            if (type.getName().equalsIgnoreCase(name)) {
                return fileSchema.getFieldIndex(type.getName());
            }
        }
        return -1;
    }

    /**
     * Resolves the Parquet type backing a Hive column, either by name or by
     * the column's Hive index.
     *
     * @param column Hive column to resolve
     * @param messageType Parquet schema to search
     * @param useParquetColumnNames when {@code true} resolve by column name,
     *        otherwise by {@code column.getHiveColumnIndex()}
     * @return the Parquet type, or {@code null} when the name is not found or
     *         the index is not below the schema's field count
     */
    public static parquet.schema.Type getParquetType(HiveColumnHandle column, MessageType messageType, boolean useParquetColumnNames)
    {
        if (useParquetColumnNames) {
            return getParquetTypeByName(column.getName(), messageType);
        }

        // NOTE(review): a negative Hive column index is passed straight to getType - confirm callers never supply one.
        if (column.getHiveColumnIndex() < messageType.getFieldCount()) {
            return messageType.getType(column.getHiveColumnIndex());
        }
        return null;
    }

    /**
     * Translates a Parquet library {@link Encoding} into the corresponding
     * {@link ParquetEncoding} constant.
     *
     * @throws ParquetDecodingException if the encoding has no counterpart
     */
    public static ParquetEncoding getParquetEncoding(Encoding encoding)
    {
        switch (encoding) {
            case PLAIN:
                return ParquetEncoding.PLAIN;
            case RLE:
                return ParquetEncoding.RLE;
            case BIT_PACKED:
                return ParquetEncoding.BIT_PACKED;
            case PLAIN_DICTIONARY:
                return ParquetEncoding.PLAIN_DICTIONARY;
            case DELTA_BINARY_PACKED:
                return ParquetEncoding.DELTA_BINARY_PACKED;
            case DELTA_LENGTH_BYTE_ARRAY:
                return ParquetEncoding.DELTA_LENGTH_BYTE_ARRAY;
            case DELTA_BYTE_ARRAY:
                return ParquetEncoding.DELTA_BYTE_ARRAY;
            case RLE_DICTIONARY:
                return ParquetEncoding.RLE_DICTIONARY;
            default:
                throw new ParquetDecodingException("Unsupported Parquet encoding: " + encoding);
        }
    }

    /**
     * Finds the top-level field called {@code columnName}, trying an exact
     * match first and a case-insensitive match second.
     *
     * @return the matching type, or {@code null} if there is none
     */
    private static parquet.schema.Type getParquetTypeByName(String columnName, MessageType messageType)
    {
        if (messageType.containsField(columnName)) {
            return messageType.getType(columnName);
        }
        // parquet is case-sensitive, but hive is not. all hive columns get converted to lowercase
        // check for direct match above but if no match found, try case-insensitive match
        for (parquet.schema.Type type : messageType.getFields()) {
            if (type.getName().equalsIgnoreCase(columnName)) {
                return type;
            }
        }

        return null;
    }

    /**
     * Creates the Presto decimal type for a column annotated as {@code DECIMAL},
     * using the precision and scale from the column's decimal metadata.
     *
     * @return the decimal type, or empty when the column's original type is not {@code DECIMAL}
     */
    public static Optional<Type> createDecimalType(RichColumnDescriptor descriptor)
    {
        if (descriptor.getPrimitiveType().getOriginalType() != DECIMAL) {
            return Optional.empty();
        }
        DecimalMetadata decimalMetadata = descriptor.getPrimitiveType().getDecimalMetadata();
        return Optional.of(DecimalType.createDecimalType(decimalMetadata.getPrecision(), decimalMetadata.getScale()));
    }
}