/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.facebook.presto.orc.metadata;
import com.facebook.presto.orc.metadata.ColumnEncoding.ColumnEncodingKind;
import com.facebook.presto.orc.metadata.OrcType.OrcTypeKind;
import com.facebook.presto.orc.metadata.PostScript.HiveWriterVersion;
import com.facebook.presto.orc.metadata.Stream.StreamKind;
import com.facebook.presto.orc.metadata.statistics.BooleanStatistics;
import com.facebook.presto.orc.metadata.statistics.ColumnStatistics;
import com.facebook.presto.orc.metadata.statistics.DoubleStatistics;
import com.facebook.presto.orc.metadata.statistics.HiveBloomFilter;
import com.facebook.presto.orc.metadata.statistics.IntegerStatistics;
import com.facebook.presto.orc.metadata.statistics.StringStatistics;
import com.facebook.presto.orc.proto.DwrfProto;
import com.facebook.presto.orc.protobuf.CodedInputStream;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Iterables;
import io.airlift.slice.Slice;
import io.airlift.slice.Slices;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import static com.facebook.presto.orc.metadata.CompressionKind.SNAPPY;
import static com.facebook.presto.orc.metadata.CompressionKind.UNCOMPRESSED;
import static com.facebook.presto.orc.metadata.CompressionKind.ZLIB;
import static com.facebook.presto.orc.metadata.CompressionKind.ZSTD;
import static com.facebook.presto.orc.metadata.OrcMetadataReader.getMaxSlice;
import static com.facebook.presto.orc.metadata.OrcMetadataReader.getMinSlice;
import static com.facebook.presto.orc.metadata.PostScript.HiveWriterVersion.ORIGINAL;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkState;
import static java.lang.Math.toIntExact;
public class DwrfMetadataReader
implements MetadataReader
{
/**
 * Parses a DWRF postscript from {@code length} bytes of {@code data} starting at {@code offset}.
 *
 * The returned {@link PostScript} is always built with an empty list as its first argument,
 * a constant {@code 0} as its third argument (presumably the metadata length, confirm against
 * the PostScript constructor), and {@code ORIGINAL} as the writer version.
 *
 * @throws IOException propagated from parsing the protobuf message
 * @throws IllegalArgumentException if the stored compression kind has no mapping in toCompression
 */
@Override
public PostScript readPostScript(byte[] data, int offset, int length)
        throws IOException
{
    DwrfProto.PostScript parsed = DwrfProto.PostScript.parseFrom(CodedInputStream.newInstance(data, offset, length));
    CompressionKind compressionKind = toCompression(parsed.getCompression());

    // DWRF doesn't have the equivalent of Hive writer version, and it is not clear if HIVE-8732 has been fixed
    HiveWriterVersion writerVersion = ORIGINAL;

    return new PostScript(
            ImmutableList.of(),
            parsed.getFooterLength(),
            0,
            compressionKind,
            parsed.getCompressionBlockSize(),
            writerVersion);
}
/**
 * Returns a {@link Metadata} built from an empty list.
 *
 * Neither {@code hiveWriterVersion} nor {@code inputStream} is read by this implementation.
 */
@Override
public Metadata readMetadata(HiveWriterVersion hiveWriterVersion, InputStream inputStream)
        throws IOException
{
    Metadata emptyMetadata = new Metadata(ImmutableList.of());
    return emptyMetadata;
}
/**
 * Parses a DWRF footer from {@code inputStream} and converts its stripes, types and user
 * metadata into the reader's own model.
 *
 * File-level column statistics are not read; an empty list is always passed to the
 * {@link Footer}. {@code hiveWriterVersion} is currently unused here.
 *
 * @throws IOException propagated from parsing the protobuf message
 */
@Override
public Footer readFooter(HiveWriterVersion hiveWriterVersion, InputStream inputStream)
        throws IOException
{
    DwrfProto.Footer parsed = DwrfProto.Footer.parseFrom(CodedInputStream.newInstance(inputStream));

    List<StripeInformation> stripes = toStripeInformation(parsed.getStripesList());
    List<OrcType> types = toType(parsed.getTypesList());

    // todo enable file stats when DWRF team verifies that the stats are correct
    // List<ColumnStatistics> fileStats = toColumnStatistics(hiveWriterVersion, footer.getStatisticsList(), false);
    List<ColumnStatistics> fileStats = ImmutableList.of();

    Map<String, Slice> userMetadata = toUserMetadata(parsed.getMetadataList());

    return new Footer(
            parsed.getNumberOfRows(),
            parsed.getRowIndexStride(),
            stripes,
            types,
            fileStats,
            userMetadata);
}
/**
 * Converts each protobuf stripe descriptor, in order, into a {@link StripeInformation}
 * and returns the results as an immutable list.
 */
private static List<StripeInformation> toStripeInformation(List<DwrfProto.StripeInformation> stripes)
{
    ImmutableList.Builder<StripeInformation> converted = ImmutableList.builder();
    for (DwrfProto.StripeInformation stripe : stripes) {
        converted.add(toStripeInformation(stripe));
    }
    return converted.build();
}
/**
 * Converts one protobuf stripe descriptor into a {@link StripeInformation}.
 *
 * @throws ArithmeticException if the stripe's row count does not fit in an {@code int}
 */
private static StripeInformation toStripeInformation(DwrfProto.StripeInformation stripeInformation)
{
    // The row count is narrowed with an overflow check; the other fields are passed through unchanged.
    int rowCount = toIntExact(stripeInformation.getNumberOfRows());

    return new StripeInformation(
            rowCount,
            stripeInformation.getOffset(),
            stripeInformation.getIndexLength(),
            stripeInformation.getDataLength(),
            stripeInformation.getFooterLength());
}
/**
 * Parses a DWRF stripe footer from {@code inputStream} and converts its streams and column
 * encodings.
 *
 * {@code types} is used to pick the encoding kind for each column and must have exactly as
 * many entries as the stripe footer has column encodings. {@code hiveWriterVersion} is
 * currently unused here.
 *
 * @throws IOException propagated from parsing the protobuf message
 */
@Override
public StripeFooter readStripeFooter(HiveWriterVersion hiveWriterVersion, List<OrcType> types, InputStream inputStream)
        throws IOException
{
    DwrfProto.StripeFooter parsed = DwrfProto.StripeFooter.parseFrom(CodedInputStream.newInstance(inputStream));

    List<Stream> streams = toStream(parsed.getStreamsList());
    List<ColumnEncoding> columnEncodings = toColumnEncoding(types, parsed.getColumnsList());

    return new StripeFooter(streams, columnEncodings);
}
/**
 * Converts one protobuf stream descriptor into a {@link Stream}.
 *
 * @throws IllegalArgumentException if the stream kind has no mapping in toStreamKind
 * @throws ArithmeticException if the stream length does not fit in an {@code int}
 */
private static Stream toStream(DwrfProto.Stream stream)
{
    StreamKind streamKind = toStreamKind(stream.getKind());
    int streamLength = toIntExact(stream.getLength());

    return new Stream(stream.getColumn(), streamKind, streamLength, stream.getUseVInts());
}
/**
 * Converts each protobuf stream descriptor, in order, into a {@link Stream} and returns the
 * results as an immutable list.
 */
private static List<Stream> toStream(List<DwrfProto.Stream> streams)
{
    ImmutableList.Builder<Stream> converted = ImmutableList.builder();
    for (DwrfProto.Stream stream : streams) {
        converted.add(toStream(stream));
    }
    return converted.build();
}
/**
 * Converts one protobuf column encoding into a {@link ColumnEncoding}.
 *
 * The column's {@code type} is needed because the resulting encoding kind depends on it
 * (see toColumnEncodingKind); the dictionary size is passed through unchanged.
 */
private static ColumnEncoding toColumnEncoding(OrcTypeKind type, DwrfProto.ColumnEncoding columnEncoding)
{
    ColumnEncodingKind encodingKind = toColumnEncodingKind(type, columnEncoding.getKind());
    return new ColumnEncoding(encodingKind, columnEncoding.getDictionarySize());
}
/**
 * Converts the column encodings of a stripe footer, pairing the encoding at position
 * {@code i} with the type at position {@code i}.
 *
 * @param types the file's types, one per column encoding
 * @param columnEncodings the protobuf column encodings, in the same order as {@code types}
 * @return an immutable list with one {@link ColumnEncoding} per input encoding
 * @throws IllegalArgumentException if the two lists differ in size, or if an encoding kind
 *         has no mapping in toColumnEncodingKind
 */
private static List<ColumnEncoding> toColumnEncoding(List<OrcType> types, List<DwrfProto.ColumnEncoding> columnEncodings)
{
    // Report both sizes so a mismatched (for example corrupt) stripe footer can be diagnosed from the exception alone.
    checkArgument(
            types.size() == columnEncodings.size(),
            "Number of types (%s) does not match number of column encodings (%s)",
            types.size(),
            columnEncodings.size());

    ImmutableList.Builder<ColumnEncoding> encodings = ImmutableList.builder();
    for (int i = 0; i < types.size(); i++) {
        OrcType type = types.get(i);
        encodings.add(toColumnEncoding(type.getOrcTypeKind(), columnEncodings.get(i)));
    }
    return encodings.build();
}
/**
 * Parses a DWRF row index from {@code inputStream} and converts each of its entries, in
 * order, into a {@link RowGroupIndex}.
 *
 * @throws IOException propagated from parsing the protobuf message
 * @throws IllegalStateException if a checkpoint position does not fit in an {@code int}
 */
@Override
public List<RowGroupIndex> readRowIndexes(HiveWriterVersion hiveWriterVersion, InputStream inputStream)
        throws IOException
{
    DwrfProto.RowIndex parsed = DwrfProto.RowIndex.parseFrom(CodedInputStream.newInstance(inputStream));

    ImmutableList.Builder<RowGroupIndex> rowGroups = ImmutableList.builder();
    for (DwrfProto.RowIndexEntry entry : parsed.getEntryList()) {
        rowGroups.add(toRowGroupIndex(hiveWriterVersion, entry));
    }
    return rowGroups.build();
}
/**
 * Always returns an empty list without reading {@code inputStream}.
 */
@Override
public List<HiveBloomFilter> readBloomFilterIndexes(InputStream inputStream)
        throws IOException
{
    // DWRF does not have bloom filters
    List<HiveBloomFilter> noBloomFilters = ImmutableList.of();
    return noBloomFilters;
}
/**
 * Converts one protobuf row index entry into a {@link RowGroupIndex}.
 *
 * Each 64-bit checkpoint position is narrowed to an {@code int}; the entry's statistics are
 * converted as row-group statistics.
 *
 * @throws IllegalStateException if a position does not survive the narrowing unchanged
 */
private static RowGroupIndex toRowGroupIndex(HiveWriterVersion hiveWriterVersion, DwrfProto.RowIndexEntry rowIndexEntry)
{
    ImmutableList.Builder<Integer> narrowedPositions = ImmutableList.builder();

    // The running index is only needed for the error message.
    int index = 0;
    for (long rawPosition : rowIndexEntry.getPositionsList()) {
        int narrowed = (int) rawPosition;
        // A round trip through int that changes the value means the position overflowed.
        checkState(narrowed == rawPosition, "Expected checkpoint position %s, to be an integer", index);
        narrowedPositions.add(narrowed);
        index++;
    }

    List<Integer> positions = narrowedPositions.build();
    ColumnStatistics statistics = toColumnStatistics(hiveWriterVersion, rowIndexEntry.getStatistics(), true);
    return new RowGroupIndex(positions, statistics);
}
/**
 * Converts each protobuf column statistics message, in order, into a {@link ColumnStatistics}.
 *
 * @return an immutable list of converted statistics; empty when {@code columnStatistics} is null
 */
private static List<ColumnStatistics> toColumnStatistics(HiveWriterVersion hiveWriterVersion, List<DwrfProto.ColumnStatistics> columnStatistics, boolean isRowGroup)
{
    ImmutableList.Builder<ColumnStatistics> converted = ImmutableList.builder();
    if (columnStatistics != null) {
        for (DwrfProto.ColumnStatistics statistics : columnStatistics) {
            converted.add(toColumnStatistics(hiveWriterVersion, statistics, isRowGroup));
        }
    }
    return converted.build();
}
/**
 * Converts the footer's user metadata items into an immutable map from item name to a
 * {@link Slice} wrapping a copy of the item's value bytes.
 *
 * Declared {@code static}, like the other conversion helpers in this class, because it reads
 * no instance state.
 *
 * NOTE(review): ImmutableMap.Builder rejects duplicate keys when built, so a footer that
 * repeats a metadata name would fail here with an IllegalArgumentException. Confirm that is
 * the intended handling.
 *
 * @param metadataList the protobuf user metadata items
 * @return an immutable name-to-value map, in the order the items were listed
 */
private static Map<String, Slice> toUserMetadata(List<DwrfProto.UserMetadataItem> metadataList)
{
    ImmutableMap.Builder<String, Slice> mapBuilder = ImmutableMap.builder();
    for (DwrfProto.UserMetadataItem item : metadataList) {
        mapBuilder.put(item.getName(), Slices.wrappedBuffer(item.getValue().toByteArray()));
    }
    return mapBuilder.build();
}
/**
 * Converts one protobuf column statistics message into a {@link ColumnStatistics}.
 *
 * Boolean, integer, double and string statistics are converted individually (each may come
 * back null when absent); the last three constructor arguments are always null.
 */
private static ColumnStatistics toColumnStatistics(HiveWriterVersion hiveWriterVersion, DwrfProto.ColumnStatistics statistics, boolean isRowGroup)
{
    BooleanStatistics booleanStatistics = toBooleanStatistics(statistics.getBucketStatistics());
    IntegerStatistics integerStatistics = toIntegerStatistics(statistics.getIntStatistics());
    DoubleStatistics doubleStatistics = toDoubleStatistics(statistics.getDoubleStatistics());
    StringStatistics stringStatistics = toStringStatistics(hiveWriterVersion, statistics.getStringStatistics(), isRowGroup);

    // NOTE(review): the three trailing nulls are statistics kinds this reader never populates; confirm which ones against the ColumnStatistics constructor.
    return new ColumnStatistics(
            statistics.getNumberOfValues(),
            booleanStatistics,
            integerStatistics,
            doubleStatistics,
            stringStatistics,
            null,
            null,
            null);
}
/**
 * Builds {@link BooleanStatistics} from the first count of the bucket statistics.
 *
 * @return the statistics, or null when the bucket statistics carry no counts
 */
private static BooleanStatistics toBooleanStatistics(DwrfProto.BucketStatistics bucketStatistics)
{
    boolean hasCounts = bucketStatistics.getCountCount() != 0;
    return hasCounts ? new BooleanStatistics(bucketStatistics.getCount(0)) : null;
}
/**
 * Builds {@link IntegerStatistics} from the protobuf integer statistics.
 *
 * @return the statistics, with a null bound for whichever of minimum/maximum is absent,
 *         or null when both are absent
 */
private static IntegerStatistics toIntegerStatistics(DwrfProto.IntegerStatistics integerStatistics)
{
    boolean hasMinimum = integerStatistics.hasMinimum();
    boolean hasMaximum = integerStatistics.hasMaximum();
    if (!(hasMinimum || hasMaximum)) {
        return null;
    }

    Long minimum = hasMinimum ? integerStatistics.getMinimum() : null;
    Long maximum = hasMaximum ? integerStatistics.getMaximum() : null;
    return new IntegerStatistics(minimum, maximum);
}
/**
 * Builds {@link DoubleStatistics} from the protobuf double statistics.
 *
 * @return the statistics, with a null bound for whichever of minimum/maximum is absent,
 *         or null when both are absent or when any present minimum, maximum or sum is NaN
 */
private static DoubleStatistics toDoubleStatistics(DwrfProto.DoubleStatistics doubleStatistics)
{
    boolean hasMinimum = doubleStatistics.hasMinimum();
    boolean hasMaximum = doubleStatistics.hasMaximum();
    if (!(hasMinimum || hasMaximum)) {
        return null;
    }

    // if either min, max, or sum is NaN, ignore the stat
    if (hasMinimum && Double.isNaN(doubleStatistics.getMinimum())) {
        return null;
    }
    if (hasMaximum && Double.isNaN(doubleStatistics.getMaximum())) {
        return null;
    }
    if (doubleStatistics.hasSum() && Double.isNaN(doubleStatistics.getSum())) {
        return null;
    }

    Double minimum = hasMinimum ? doubleStatistics.getMinimum() : null;
    Double maximum = hasMaximum ? doubleStatistics.getMaximum() : null;
    return new DoubleStatistics(minimum, maximum);
}
/**
 * Builds {@link StringStatistics} from the protobuf string statistics.
 *
 * Statistics that are not row-group level are dropped when the writer version is
 * {@code ORIGINAL}. Present bounds are passed through getMinSlice / getMaxSlice.
 *
 * @return the statistics, with a null bound for whichever of minimum/maximum is absent,
 *         or null when they are dropped or when both bounds are absent
 */
private static StringStatistics toStringStatistics(HiveWriterVersion hiveWriterVersion, DwrfProto.StringStatistics stringStatistics, boolean isRowGroup)
{
    boolean dropped = !isRowGroup && hiveWriterVersion == ORIGINAL;
    if (dropped) {
        return null;
    }

    boolean hasMinimum = stringStatistics.hasMinimum();
    boolean hasMaximum = stringStatistics.hasMaximum();
    if (!(hasMinimum || hasMaximum)) {
        return null;
    }

    Slice minimum = null;
    if (hasMinimum) {
        minimum = getMinSlice(stringStatistics.getMinimum());
    }
    Slice maximum = null;
    if (hasMaximum) {
        maximum = getMaxSlice(stringStatistics.getMaximum());
    }
    return new StringStatistics(minimum, maximum);
}
/**
 * Converts one protobuf type into an {@link OrcType}.
 *
 * The subtype and field-name lists are passed through; the last two constructor arguments
 * are always {@code Optional.empty()}.
 *
 * @throws IllegalArgumentException if the type kind has no mapping in toTypeKind
 */
private static OrcType toType(DwrfProto.Type type)
{
    OrcTypeKind typeKind = toTypeKind(type.getKind());
    return new OrcType(
            typeKind,
            type.getSubtypesList(),
            type.getFieldNamesList(),
            Optional.empty(),
            Optional.empty());
}
/**
 * Converts each protobuf type, in order, into an {@link OrcType} and returns the results as
 * an immutable list.
 */
private static List<OrcType> toType(List<DwrfProto.Type> types)
{
    ImmutableList.Builder<OrcType> converted = ImmutableList.builder();
    for (DwrfProto.Type type : types) {
        converted.add(toType(type));
    }
    return converted.build();
}
/**
 * Maps a protobuf type kind onto the {@link OrcTypeKind} of the same name.
 *
 * @throws IllegalArgumentException for any kind not listed below
 */
private static OrcTypeKind toTypeKind(DwrfProto.Type.Kind kind)
{
    final OrcTypeKind mapped;
    switch (kind) {
        case BOOLEAN:
            mapped = OrcTypeKind.BOOLEAN;
            break;
        case BYTE:
            mapped = OrcTypeKind.BYTE;
            break;
        case SHORT:
            mapped = OrcTypeKind.SHORT;
            break;
        case INT:
            mapped = OrcTypeKind.INT;
            break;
        case LONG:
            mapped = OrcTypeKind.LONG;
            break;
        case FLOAT:
            mapped = OrcTypeKind.FLOAT;
            break;
        case DOUBLE:
            mapped = OrcTypeKind.DOUBLE;
            break;
        case STRING:
            mapped = OrcTypeKind.STRING;
            break;
        case BINARY:
            mapped = OrcTypeKind.BINARY;
            break;
        case TIMESTAMP:
            mapped = OrcTypeKind.TIMESTAMP;
            break;
        case LIST:
            mapped = OrcTypeKind.LIST;
            break;
        case MAP:
            mapped = OrcTypeKind.MAP;
            break;
        case STRUCT:
            mapped = OrcTypeKind.STRUCT;
            break;
        case UNION:
            mapped = OrcTypeKind.UNION;
            break;
        default:
            throw new IllegalArgumentException(kind + " data type not implemented yet");
    }
    return mapped;
}
/**
 * Maps a protobuf stream kind onto the reader's {@link StreamKind}.
 *
 * Most kinds map to the constant of the same name. The exceptions are NANO_DATA, which maps
 * to SECONDARY, and STRIDE_DICTIONARY / STRIDE_DICTIONARY_LENGTH, which map to
 * ROW_GROUP_DICTIONARY / ROW_GROUP_DICTIONARY_LENGTH.
 *
 * @throws IllegalArgumentException for any kind not listed below
 */
private static StreamKind toStreamKind(DwrfProto.Stream.Kind kind)
{
    final StreamKind mapped;
    switch (kind) {
        case PRESENT:
            mapped = StreamKind.PRESENT;
            break;
        case DATA:
            mapped = StreamKind.DATA;
            break;
        case LENGTH:
            mapped = StreamKind.LENGTH;
            break;
        case DICTIONARY_DATA:
            mapped = StreamKind.DICTIONARY_DATA;
            break;
        case DICTIONARY_COUNT:
            mapped = StreamKind.DICTIONARY_COUNT;
            break;
        case NANO_DATA:
            mapped = StreamKind.SECONDARY;
            break;
        case ROW_INDEX:
            mapped = StreamKind.ROW_INDEX;
            break;
        case IN_DICTIONARY:
            mapped = StreamKind.IN_DICTIONARY;
            break;
        case STRIDE_DICTIONARY:
            mapped = StreamKind.ROW_GROUP_DICTIONARY;
            break;
        case STRIDE_DICTIONARY_LENGTH:
            mapped = StreamKind.ROW_GROUP_DICTIONARY_LENGTH;
            break;
        default:
            throw new IllegalArgumentException(kind + " stream type not implemented yet");
    }
    return mapped;
}
/**
 * Maps a protobuf column encoding kind onto the reader's {@link ColumnEncodingKind}.
 *
 * DIRECT becomes DWRF_DIRECT for SHORT, INT and LONG columns and stays DIRECT for every
 * other type; DICTIONARY maps to DICTIONARY regardless of type.
 *
 * @throws IllegalArgumentException for any other encoding kind
 */
private static ColumnEncodingKind toColumnEncodingKind(OrcTypeKind type, DwrfProto.ColumnEncoding.Kind kind)
{
    switch (kind) {
        case DIRECT: {
            boolean usesDwrfDirect = type == OrcTypeKind.SHORT
                    || type == OrcTypeKind.INT
                    || type == OrcTypeKind.LONG;
            return usesDwrfDirect ? ColumnEncodingKind.DWRF_DIRECT : ColumnEncodingKind.DIRECT;
        }
        case DICTIONARY: {
            return ColumnEncodingKind.DICTIONARY;
        }
        default: {
            throw new IllegalArgumentException(kind + " stream encoding not implemented yet");
        }
    }
}
/**
 * Maps a protobuf compression kind onto the reader's {@link CompressionKind}.
 *
 * NONE maps to UNCOMPRESSED; ZLIB, SNAPPY and ZSTD map to the constants of the same name.
 *
 * @throws IllegalArgumentException for any other compression kind
 */
private static CompressionKind toCompression(DwrfProto.CompressionKind compression)
{
    final CompressionKind mapped;
    switch (compression) {
        case NONE:
            mapped = UNCOMPRESSED;
            break;
        case ZLIB:
            mapped = ZLIB;
            break;
        case SNAPPY:
            mapped = SNAPPY;
            break;
        case ZSTD:
            mapped = ZSTD;
            break;
        default:
            throw new IllegalArgumentException(compression + " compression not implemented yet");
    }
    return mapped;
}
}