/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.facebook.presto.hive.parquet.predicate;
import com.facebook.presto.hive.parquet.ParquetDictionaryPage;
import com.facebook.presto.hive.parquet.RichColumnDescriptor;
import com.facebook.presto.hive.parquet.dictionary.ParquetDictionary;
import com.facebook.presto.spi.predicate.Domain;
import com.facebook.presto.spi.predicate.Range;
import com.facebook.presto.spi.predicate.TupleDomain;
import com.facebook.presto.spi.predicate.ValueSet;
import com.facebook.presto.spi.type.Type;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import io.airlift.slice.Slice;
import io.airlift.slice.Slices;
import parquet.column.ColumnDescriptor;
import parquet.column.statistics.BinaryStatistics;
import parquet.column.statistics.BooleanStatistics;
import parquet.column.statistics.DoubleStatistics;
import parquet.column.statistics.FloatStatistics;
import parquet.column.statistics.IntStatistics;
import parquet.column.statistics.LongStatistics;
import parquet.column.statistics.Statistics;
import parquet.schema.PrimitiveType.PrimitiveTypeName;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.function.Function;
import static com.facebook.presto.hive.parquet.ParquetTypeUtils.getPrestoType;
import static com.facebook.presto.hive.parquet.predicate.ParquetPredicateUtils.isStatisticsOverflow;
import static com.facebook.presto.spi.type.BigintType.BIGINT;
import static com.facebook.presto.spi.type.BooleanType.BOOLEAN;
import static com.facebook.presto.spi.type.DoubleType.DOUBLE;
import static com.facebook.presto.spi.type.IntegerType.INTEGER;
import static com.facebook.presto.spi.type.RealType.REAL;
import static com.facebook.presto.spi.type.SmallintType.SMALLINT;
import static com.facebook.presto.spi.type.TinyintType.TINYINT;
import static com.facebook.presto.spi.type.Varchars.isVarcharType;
import static java.lang.Float.floatToRawIntBits;
import static java.util.Objects.requireNonNull;
public class TupleDomainParquetPredicate
implements ParquetPredicate
{
private final TupleDomain<ColumnDescriptor> effectivePredicate;
private final List<RichColumnDescriptor> columns;
public TupleDomainParquetPredicate(TupleDomain<ColumnDescriptor> effectivePredicate, List<RichColumnDescriptor> columns)
{
this.effectivePredicate = requireNonNull(effectivePredicate, "effectivePredicate is null");
this.columns = ImmutableList.copyOf(requireNonNull(columns, "columns is null"));
}
@Override
public boolean matches(long numberOfRows, Map<ColumnDescriptor, Statistics<?>> statistics)
{
if (numberOfRows == 0) {
return false;
}
ImmutableMap.Builder<ColumnDescriptor, Domain> domains = ImmutableMap.builder();
for (RichColumnDescriptor column : columns) {
Statistics<?> columnStatistics = statistics.get(column);
Domain domain;
Type type = getPrestoType(column);
if (columnStatistics == null || columnStatistics.isEmpty()) {
// no stats for column
domain = Domain.all(type);
}
else {
domain = getDomain(type, numberOfRows, columnStatistics);
}
domains.put(column, domain);
}
TupleDomain<ColumnDescriptor> stripeDomain = TupleDomain.withColumnDomains(domains.build());
return effectivePredicate.overlaps(stripeDomain);
}
@Override
public boolean matches(Map<ColumnDescriptor, ParquetDictionaryDescriptor> dictionaries)
{
ImmutableMap.Builder<ColumnDescriptor, Domain> domains = ImmutableMap.builder();
for (RichColumnDescriptor column : columns) {
ParquetDictionaryDescriptor dictionaryDescriptor = dictionaries.get(column);
Domain domain = getDomain(getPrestoType(column), dictionaryDescriptor);
if (domain != null) {
domains.put(column, domain);
}
}
TupleDomain<ColumnDescriptor> stripeDomain = TupleDomain.withColumnDomains(domains.build());
return effectivePredicate.overlaps(stripeDomain);
}
@VisibleForTesting
public static Domain getDomain(Type type, long rowCount, Statistics<?> statistics)
{
if (statistics == null || statistics.isEmpty()) {
return Domain.all(type);
}
if (statistics.getNumNulls() == rowCount) {
return Domain.onlyNull(type);
}
boolean hasNullValue = statistics.getNumNulls() != 0L;
// ignore corrupted statistics
if (statistics.genericGetMin() == null || statistics.genericGetMax() == null) {
return Domain.create(ValueSet.all(type), hasNullValue);
}
if (type.equals(BOOLEAN) && statistics instanceof BooleanStatistics) {
BooleanStatistics booleanStatistics = (BooleanStatistics) statistics;
boolean hasTrueValues = !(booleanStatistics.getMax() == false && booleanStatistics.getMin() == false);
boolean hasFalseValues = !(booleanStatistics.getMax() == true && booleanStatistics.getMin() == true);
if (hasTrueValues && hasFalseValues) {
return Domain.all(type);
}
if (hasTrueValues) {
return Domain.create(ValueSet.of(type, true), hasNullValue);
}
if (hasFalseValues) {
return Domain.create(ValueSet.of(type, false), hasNullValue);
}
}
else if ((type.equals(BIGINT) || type.equals(TINYINT) || type.equals(SMALLINT) || type.equals(INTEGER)) && (statistics instanceof LongStatistics || statistics instanceof IntStatistics)) {
ParquetIntegerStatistics parquetIntegerStatistics;
if (statistics instanceof LongStatistics) {
LongStatistics longStatistics = (LongStatistics) statistics;
// ignore corrupted statistics
if (longStatistics.genericGetMin() > longStatistics.genericGetMax()) {
return Domain.create(ValueSet.all(type), hasNullValue);
}
parquetIntegerStatistics = new ParquetIntegerStatistics(longStatistics.genericGetMin(), longStatistics.genericGetMax());
}
else {
IntStatistics intStatistics = (IntStatistics) statistics;
// ignore corrupted statistics
if (intStatistics.genericGetMin() > intStatistics.genericGetMax()) {
return Domain.create(ValueSet.all(type), hasNullValue);
}
parquetIntegerStatistics = new ParquetIntegerStatistics((long) intStatistics.getMin(), (long) intStatistics.getMax());
}
if (isStatisticsOverflow(type, parquetIntegerStatistics)) {
return Domain.create(ValueSet.all(type), hasNullValue);
}
return createDomain(type, hasNullValue, parquetIntegerStatistics);
}
else if (type.equals(REAL) && statistics instanceof FloatStatistics) {
FloatStatistics floatStatistics = (FloatStatistics) statistics;
// ignore corrupted statistics
if (floatStatistics.genericGetMin() > floatStatistics.genericGetMax()) {
return Domain.create(ValueSet.all(type), hasNullValue);
}
ParquetIntegerStatistics parquetStatistics = new ParquetIntegerStatistics(
(long) floatToRawIntBits(floatStatistics.getMin()),
(long) floatToRawIntBits(floatStatistics.getMax())
);
return createDomain(type, hasNullValue, parquetStatistics);
}
else if (type.equals(DOUBLE) && statistics instanceof DoubleStatistics) {
DoubleStatistics doubleStatistics = (DoubleStatistics) statistics;
// ignore corrupted statistics
if (doubleStatistics.genericGetMin() > doubleStatistics.genericGetMax()) {
return Domain.create(ValueSet.all(type), hasNullValue);
}
ParquetDoubleStatistics parquetDoubleStatistics = new ParquetDoubleStatistics(doubleStatistics.genericGetMin(), doubleStatistics.genericGetMax());
return createDomain(type, hasNullValue, parquetDoubleStatistics);
}
else if (isVarcharType(type) && statistics instanceof BinaryStatistics) {
BinaryStatistics binaryStatistics = (BinaryStatistics) statistics;
Slice minSlice = Slices.wrappedBuffer(binaryStatistics.getMin().getBytes());
Slice maxSlice = Slices.wrappedBuffer(binaryStatistics.getMax().getBytes());
// ignore corrupted statistics
if (minSlice.compareTo(maxSlice) > 0) {
return Domain.create(ValueSet.all(type), hasNullValue);
}
ParquetStringStatistics parquetStringStatistics = new ParquetStringStatistics(minSlice, maxSlice);
return createDomain(type, hasNullValue, parquetStringStatistics);
}
return Domain.create(ValueSet.all(type), hasNullValue);
}
@VisibleForTesting
public static Domain getDomain(Type type, ParquetDictionaryDescriptor dictionaryDescriptor)
{
if (dictionaryDescriptor == null) {
return null;
}
ColumnDescriptor columnDescriptor = dictionaryDescriptor.getColumnDescriptor();
Optional<ParquetDictionaryPage> dictionaryPage = dictionaryDescriptor.getDictionaryPage();
if (!dictionaryPage.isPresent()) {
return null;
}
ParquetDictionary dictionary;
try {
dictionary = dictionaryPage.get().getEncoding().initDictionary(columnDescriptor, dictionaryPage.get());
}
catch (Exception e) {
// In case of exception, just continue reading the data, not using dictionary page at all
// OK to ignore exception when reading dictionaries
return null;
}
int dictionarySize = dictionaryPage.get().getDictionarySize();
if (type.equals(BIGINT) && columnDescriptor.getType() == PrimitiveTypeName.INT64) {
List<Domain> domains = new ArrayList<>();
for (int i = 0; i < dictionarySize; i++) {
domains.add(Domain.singleValue(type, dictionary.decodeToLong(i)));
}
domains.add(Domain.onlyNull(type));
return Domain.union(domains);
}
else if (type.equals(BIGINT) && columnDescriptor.getType() == PrimitiveTypeName.INT32) {
List<Domain> domains = new ArrayList<>();
for (int i = 0; i < dictionarySize; i++) {
domains.add(Domain.singleValue(type, (long) dictionary.decodeToInt(i)));
}
domains.add(Domain.onlyNull(type));
return Domain.union(domains);
}
else if (type.equals(DOUBLE) && columnDescriptor.getType() == PrimitiveTypeName.DOUBLE) {
List<Domain> domains = new ArrayList<>();
for (int i = 0; i < dictionarySize; i++) {
domains.add(Domain.singleValue(type, dictionary.decodeToDouble(i)));
}
domains.add(Domain.onlyNull(type));
return Domain.union(domains);
}
else if (type.equals(DOUBLE) && columnDescriptor.getType() == PrimitiveTypeName.FLOAT) {
List<Domain> domains = new ArrayList<>();
for (int i = 0; i < dictionarySize; i++) {
domains.add(Domain.singleValue(type, (double) dictionary.decodeToFloat(i)));
}
domains.add(Domain.onlyNull(type));
return Domain.union(domains);
}
else if (isVarcharType(type) && columnDescriptor.getType() == PrimitiveTypeName.BINARY) {
List<Domain> domains = new ArrayList<>();
for (int i = 0; i < dictionarySize; i++) {
domains.add(Domain.singleValue(type, Slices.wrappedBuffer(dictionary.decodeToBinary(i).getBytes())));
}
domains.add(Domain.onlyNull(type));
return Domain.union(domains);
}
return null;
}
private static <T extends Comparable<T>> Domain createDomain(Type type, boolean hasNullValue, ParquetRangeStatistics<T> rangeStatistics)
{
return createDomain(type, hasNullValue, rangeStatistics, value -> value);
}
private static <F, T extends Comparable<T>> Domain createDomain(Type type,
boolean hasNullValue,
ParquetRangeStatistics<F> rangeStatistics,
Function<F, T> function)
{
F min = rangeStatistics.getMin();
F max = rangeStatistics.getMax();
if (min != null && max != null) {
return Domain.create(ValueSet.ofRanges(Range.range(type, function.apply(min), true, function.apply(max), true)), hasNullValue);
}
if (max != null) {
return Domain.create(ValueSet.ofRanges(Range.lessThanOrEqual(type, function.apply(max))), hasNullValue);
}
if (min != null) {
return Domain.create(ValueSet.ofRanges(Range.greaterThanOrEqual(type, function.apply(min))), hasNullValue);
}
return Domain.create(ValueSet.all(type), hasNullValue);
}
}