/* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.facebook.presto.orc; import com.facebook.presto.orc.metadata.statistics.BooleanStatistics; import com.facebook.presto.orc.metadata.statistics.ColumnStatistics; import com.facebook.presto.orc.metadata.statistics.HiveBloomFilter; import com.facebook.presto.orc.metadata.statistics.RangeStatistics; import com.facebook.presto.spi.predicate.Domain; import com.facebook.presto.spi.predicate.Range; import com.facebook.presto.spi.predicate.TupleDomain; import com.facebook.presto.spi.predicate.ValueSet; import com.facebook.presto.spi.type.DecimalType; import com.facebook.presto.spi.type.StandardTypes; import com.facebook.presto.spi.type.Type; import com.facebook.presto.spi.type.VarbinaryType; import com.facebook.presto.spi.type.VarcharType; import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableList; import io.airlift.slice.Slice; import org.apache.hive.common.util.BloomFilter; import java.util.Collection; import java.util.List; import java.util.Map; import java.util.Optional; import java.util.function.Function; import static com.facebook.presto.spi.type.BigintType.BIGINT; import static com.facebook.presto.spi.type.BooleanType.BOOLEAN; import static com.facebook.presto.spi.type.Chars.isCharType; import static com.facebook.presto.spi.type.Chars.trimSpacesAndTruncateToLength; import static com.facebook.presto.spi.type.Decimals.encodeUnscaledValue; import static com.facebook.presto.spi.type.Decimals.isLongDecimal; import static com.facebook.presto.spi.type.Decimals.isShortDecimal; import static com.facebook.presto.spi.type.Decimals.rescale; import static com.facebook.presto.spi.type.DoubleType.DOUBLE; import static com.facebook.presto.spi.type.IntegerType.INTEGER; import static com.facebook.presto.spi.type.RealType.REAL; import static com.facebook.presto.spi.type.SmallintType.SMALLINT; import static com.facebook.presto.spi.type.TinyintType.TINYINT; import static com.facebook.presto.spi.type.Varchars.isVarcharType; import static com.google.common.base.MoreObjects.toStringHelper; import static com.google.common.base.Preconditions.checkArgument; import static java.lang.Float.floatToRawIntBits; import static java.util.Objects.requireNonNull; public class TupleDomainOrcPredicate<C> implements OrcPredicate { private final TupleDomain<C> effectivePredicate; private final List<ColumnReference<C>> columnReferences; private final boolean orcBloomFiltersEnabled; public TupleDomainOrcPredicate(TupleDomain<C> effectivePredicate, List<ColumnReference<C>> columnReferences, boolean orcBloomFiltersEnabled) { this.effectivePredicate = requireNonNull(effectivePredicate, "effectivePredicate is null"); this.columnReferences = ImmutableList.copyOf(requireNonNull(columnReferences, "columnReferences is null")); this.orcBloomFiltersEnabled = orcBloomFiltersEnabled; } @Override public boolean matches(long numberOfRows, Map<Integer, ColumnStatistics> statisticsByColumnIndex) { Optional<Map<C, Domain>> optionalEffectivePredicateDomains = effectivePredicate.getDomains(); if (!optionalEffectivePredicateDomains.isPresent()) { // effective predicate is none, so skip this section return false; } Map<C, Domain> effectivePredicateDomains = optionalEffectivePredicateDomains.get(); for (ColumnReference<C> columnReference : columnReferences) { Domain predicateDomain = effectivePredicateDomains.get(columnReference.getColumn()); if (predicateDomain == null) { // no predicate on this column, so we can't exclude this section continue; } ColumnStatistics columnStatistics = statisticsByColumnIndex.get(columnReference.getOrdinal()); if (columnStatistics == null) { // no statistics for this column, so we can't exclude this section continue; } if (!columnOverlaps(columnReference, predicateDomain, numberOfRows, columnStatistics)) { return false; } } // this section was not excluded return true; } private boolean columnOverlaps(ColumnReference<C> columnReference, Domain predicateDomain, long numberOfRows, ColumnStatistics columnStatistics) { Domain stripeDomain = getDomain(columnReference.getType(), numberOfRows, columnStatistics); if (!stripeDomain.overlaps(predicateDomain)) { // there is no overlap between the predicate and this column return false; } // if bloom filters are not enabled, we can not restrict the range overlap if (!orcBloomFiltersEnabled) { return true; } // if there an overlap in null values, the bloom filter can not eliminate the overlap if (predicateDomain.isNullAllowed() && stripeDomain.isNullAllowed()) { return true; } // extract the discrete values from the predicate Optional<Collection<Object>> discreteValues = extractDiscreteValues(predicateDomain.getValues()); if (!discreteValues.isPresent()) { // values are not discrete, so we can't exclude this section return true; } HiveBloomFilter bloomFilter = columnStatistics.getBloomFilter(); if (bloomFilter == null) { // no bloom filter so we can't exclude this section return true; } // if none of the discrete predicate values are found in the bloom filter, there is no overlap and the section should be skipped if (discreteValues.get().stream().noneMatch(value -> checkInBloomFilter(bloomFilter, value, stripeDomain.getType()))) { return false; } return true; } @VisibleForTesting public static Optional<Collection<Object>> extractDiscreteValues(ValueSet valueSet) { return valueSet.getValuesProcessor().transform( ranges -> { ImmutableList.Builder<Object> discreteValues = ImmutableList.builder(); for (Range range : ranges.getOrderedRanges()) { if (!range.isSingleValue()) { return Optional.empty(); } discreteValues.add(range.getSingleValue()); } return Optional.of(discreteValues.build()); }, discreteValues -> Optional.of(discreteValues.getValues()), allOrNone -> allOrNone.isAll() ? Optional.empty() : Optional.of(ImmutableList.of())); } // checks whether a value part of the effective predicate is likely to be part of this bloom filter @VisibleForTesting public static boolean checkInBloomFilter(BloomFilter bloomFilter, Object predicateValue, Type sqlType) { if (sqlType == TINYINT || sqlType == SMALLINT || sqlType == INTEGER || sqlType == BIGINT) { return bloomFilter.testLong(((Number) predicateValue).longValue()); } if (sqlType == DOUBLE) { return bloomFilter.testDouble((Double) predicateValue); } if (sqlType instanceof VarcharType || sqlType instanceof VarbinaryType) { return bloomFilter.test(((Slice) predicateValue).getBytes()); } // todo support DECIMAL, FLOAT, DATE, TIMESTAMP, and CHAR return true; } @VisibleForTesting public static Domain getDomain(Type type, long rowCount, ColumnStatistics columnStatistics) { if (rowCount == 0) { return Domain.none(type); } if (columnStatistics == null) { return Domain.all(type); } if (columnStatistics.hasNumberOfValues() && columnStatistics.getNumberOfValues() == 0) { return Domain.onlyNull(type); } boolean hasNullValue = columnStatistics.getNumberOfValues() != rowCount; if (type.getJavaType() == boolean.class && columnStatistics.getBooleanStatistics() != null) { BooleanStatistics booleanStatistics = columnStatistics.getBooleanStatistics(); boolean hasTrueValues = (booleanStatistics.getTrueValueCount() != 0); boolean hasFalseValues = (columnStatistics.getNumberOfValues() != booleanStatistics.getTrueValueCount()); if (hasTrueValues && hasFalseValues) { return Domain.all(BOOLEAN); } if (hasTrueValues) { return Domain.create(ValueSet.of(BOOLEAN, true), hasNullValue); } if (hasFalseValues) { return Domain.create(ValueSet.of(BOOLEAN, false), hasNullValue); } } else if (isShortDecimal(type) && columnStatistics.getDecimalStatistics() != null) { return createDomain(type, hasNullValue, columnStatistics.getDecimalStatistics(), value -> rescale(value, (DecimalType) type).unscaledValue().longValue()); } else if (isLongDecimal(type) && columnStatistics.getDecimalStatistics() != null) { return createDomain(type, hasNullValue, columnStatistics.getDecimalStatistics(), value -> encodeUnscaledValue(rescale(value, (DecimalType) type).unscaledValue())); } else if (isCharType(type) && columnStatistics.getStringStatistics() != null) { return createDomain(type, hasNullValue, columnStatistics.getStringStatistics(), value -> trimSpacesAndTruncateToLength(value, type)); } else if (isVarcharType(type) && columnStatistics.getStringStatistics() != null) { return createDomain(type, hasNullValue, columnStatistics.getStringStatistics()); } else if (type.getTypeSignature().getBase().equals(StandardTypes.DATE) && columnStatistics.getDateStatistics() != null) { return createDomain(type, hasNullValue, columnStatistics.getDateStatistics(), value -> (long) value); } else if (type.getJavaType() == long.class && columnStatistics.getIntegerStatistics() != null) { return createDomain(type, hasNullValue, columnStatistics.getIntegerStatistics()); } else if (type.getJavaType() == double.class && columnStatistics.getDoubleStatistics() != null) { return createDomain(type, hasNullValue, columnStatistics.getDoubleStatistics()); } else if (REAL.equals(type) && columnStatistics.getDoubleStatistics() != null) { return createDomain(type, hasNullValue, columnStatistics.getDoubleStatistics(), value -> (long) floatToRawIntBits(value.floatValue())); } return Domain.create(ValueSet.all(type), hasNullValue); } private static <T extends Comparable<T>> Domain createDomain(Type type, boolean hasNullValue, RangeStatistics<T> rangeStatistics) { return createDomain(type, hasNullValue, rangeStatistics, value -> value); } private static <F, T extends Comparable<T>> Domain createDomain(Type type, boolean hasNullValue, RangeStatistics<F> rangeStatistics, Function<F, T> function) { F min = rangeStatistics.getMin(); F max = rangeStatistics.getMax(); if (min != null && max != null) { return Domain.create(ValueSet.ofRanges(Range.range(type, function.apply(min), true, function.apply(max), true)), hasNullValue); } if (max != null) { return Domain.create(ValueSet.ofRanges(Range.lessThanOrEqual(type, function.apply(max))), hasNullValue); } if (min != null) { return Domain.create(ValueSet.ofRanges(Range.greaterThanOrEqual(type, function.apply(min))), hasNullValue); } return Domain.create(ValueSet.all(type), hasNullValue); } public static class ColumnReference<C> { private final C column; private final int ordinal; private final Type type; public ColumnReference(C column, int ordinal, Type type) { this.column = requireNonNull(column, "column is null"); checkArgument(ordinal >= 0, "ordinal is negative"); this.ordinal = ordinal; this.type = requireNonNull(type, "type is null"); } public C getColumn() { return column; } public int getOrdinal() { return ordinal; } public Type getType() { return type; } @Override public String toString() { return toStringHelper(this) .add("column", column) .add("ordinal", ordinal) .add("type", type) .toString(); } } }