/* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.facebook.presto.raptor.storage; import com.facebook.presto.orc.OrcPredicate; import com.facebook.presto.orc.OrcReader; import com.facebook.presto.orc.OrcRecordReader; import com.facebook.presto.orc.memory.AggregatedMemoryContext; import com.facebook.presto.raptor.metadata.ColumnStats; import com.facebook.presto.spi.PrestoException; import com.facebook.presto.spi.block.Block; import com.facebook.presto.spi.type.BigintType; import com.facebook.presto.spi.type.BooleanType; import com.facebook.presto.spi.type.DateType; import com.facebook.presto.spi.type.DoubleType; import com.facebook.presto.spi.type.TimestampType; import com.facebook.presto.spi.type.Type; import com.facebook.presto.spi.type.VarcharType; import com.google.common.collect.ImmutableMap; import io.airlift.slice.Slice; import java.io.IOException; import java.util.List; import java.util.Optional; import static com.facebook.presto.raptor.RaptorErrorCode.RAPTOR_ERROR; import static java.lang.Double.isInfinite; import static java.lang.Double.isNaN; import static org.joda.time.DateTimeZone.UTC; public final class ShardStats { /** * Maximum length of a binary value stored in an index. */ public static final int MAX_BINARY_INDEX_SIZE = 100; private ShardStats() {} public static Slice truncateIndexValue(Slice slice) { if (slice.length() > MAX_BINARY_INDEX_SIZE) { return slice.slice(0, MAX_BINARY_INDEX_SIZE); } return slice; } public static Optional<ColumnStats> computeColumnStats(OrcReader orcReader, long columnId, Type type) throws IOException { return Optional.ofNullable(doComputeColumnStats(orcReader, columnId, type)); } private static ColumnStats doComputeColumnStats(OrcReader orcReader, long columnId, Type type) throws IOException { int columnIndex = columnIndex(orcReader.getColumnNames(), columnId); OrcRecordReader reader = orcReader.createRecordReader(ImmutableMap.of(columnIndex, type), OrcPredicate.TRUE, UTC, new AggregatedMemoryContext()); if (type.equals(BooleanType.BOOLEAN)) { return indexBoolean(type, reader, columnIndex, columnId); } if (type.equals(BigintType.BIGINT) || type.equals(DateType.DATE) || type.equals(TimestampType.TIMESTAMP)) { return indexLong(type, reader, columnIndex, columnId); } if (type.equals(DoubleType.DOUBLE)) { return indexDouble(type, reader, columnIndex, columnId); } if (type instanceof VarcharType) { return indexString(type, reader, columnIndex, columnId); } return null; } private static int columnIndex(List<String> columnNames, long columnId) { int index = columnNames.indexOf(String.valueOf(columnId)); if (index == -1) { throw new PrestoException(RAPTOR_ERROR, "Missing column ID: " + columnId); } return index; } private static ColumnStats indexBoolean(Type type, OrcRecordReader reader, int columnIndex, long columnId) throws IOException { boolean minSet = false; boolean maxSet = false; boolean min = false; boolean max = false; while (true) { int batchSize = reader.nextBatch(); if (batchSize <= 0) { break; } Block block = reader.readBlock(type, columnIndex); for (int i = 0; i < batchSize; i++) { if (block.isNull(i)) { continue; } boolean value = type.getBoolean(block, i); if (!minSet || Boolean.compare(value, min) < 0) { minSet = true; min = value; } if (!maxSet || Boolean.compare(value, max) > 0) { maxSet = true; max = value; } } } return new ColumnStats(columnId, minSet ? min : null, maxSet ? max : null); } private static ColumnStats indexLong(Type type, OrcRecordReader reader, int columnIndex, long columnId) throws IOException { boolean minSet = false; boolean maxSet = false; long min = 0; long max = 0; while (true) { int batchSize = reader.nextBatch(); if (batchSize <= 0) { break; } Block block = reader.readBlock(type, columnIndex); for (int i = 0; i < batchSize; i++) { if (block.isNull(i)) { continue; } long value = type.getLong(block, i); if (!minSet || (value < min)) { minSet = true; min = value; } if (!maxSet || (value > max)) { maxSet = true; max = value; } } } return new ColumnStats(columnId, minSet ? min : null, maxSet ? max : null); } private static ColumnStats indexDouble(Type type, OrcRecordReader reader, int columnIndex, long columnId) throws IOException { boolean minSet = false; boolean maxSet = false; double min = 0; double max = 0; while (true) { int batchSize = reader.nextBatch(); if (batchSize <= 0) { break; } Block block = reader.readBlock(type, columnIndex); for (int i = 0; i < batchSize; i++) { if (block.isNull(i)) { continue; } double value = type.getDouble(block, i); if (isNaN(value)) { continue; } if (value == -0.0) { value = 0.0; } if (!minSet || (value < min)) { minSet = true; min = value; } if (!maxSet || (value > max)) { maxSet = true; max = value; } } } if (isInfinite(min)) { minSet = false; } if (isInfinite(max)) { maxSet = false; } return new ColumnStats(columnId, minSet ? min : null, maxSet ? max : null); } private static ColumnStats indexString(Type type, OrcRecordReader reader, int columnIndex, long columnId) throws IOException { boolean minSet = false; boolean maxSet = false; Slice min = null; Slice max = null; while (true) { int batchSize = reader.nextBatch(); if (batchSize <= 0) { break; } Block block = reader.readBlock(type, columnIndex); for (int i = 0; i < batchSize; i++) { if (block.isNull(i)) { continue; } Slice slice = type.getSlice(block, i); slice = truncateIndexValue(slice); if (!minSet || (slice.compareTo(min) < 0)) { minSet = true; min = slice; } if (!maxSet || (slice.compareTo(max) > 0)) { maxSet = true; max = slice; } } } return new ColumnStats(columnId, minSet ? min.toStringUtf8() : null, maxSet ? max.toStringUtf8() : null); } }