/* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.facebook.presto.orc.reader; import com.facebook.presto.orc.OrcCorruptionException; import com.facebook.presto.orc.StreamDescriptor; import com.facebook.presto.orc.metadata.ColumnEncoding; import com.facebook.presto.orc.stream.BooleanInputStream; import com.facebook.presto.orc.stream.InputStreamSource; import com.facebook.presto.orc.stream.InputStreamSources; import com.facebook.presto.orc.stream.LongInputStream; import com.facebook.presto.spi.block.Block; import com.facebook.presto.spi.block.BlockBuilder; import com.facebook.presto.spi.block.BlockBuilderStatus; import com.facebook.presto.spi.type.Type; import org.joda.time.DateTime; import org.joda.time.DateTimeZone; import javax.annotation.Nonnull; import javax.annotation.Nullable; import java.io.IOException; import java.util.List; import static com.facebook.presto.orc.metadata.Stream.StreamKind.DATA; import static com.facebook.presto.orc.metadata.Stream.StreamKind.PRESENT; import static com.facebook.presto.orc.metadata.Stream.StreamKind.SECONDARY; import static com.facebook.presto.orc.stream.MissingInputStreamSource.missingStreamSource; import static com.google.common.base.MoreObjects.toStringHelper; import static java.util.Objects.requireNonNull; public class TimestampStreamReader implements StreamReader { private static final int MILLIS_PER_SECOND = 1000; private final StreamDescriptor streamDescriptor; private final long baseTimestampInSeconds; private int readOffset; private int nextBatchSize; @Nonnull private InputStreamSource<BooleanInputStream> presentStreamSource = missingStreamSource(BooleanInputStream.class); @Nullable private BooleanInputStream presentStream; private boolean[] nullVector = new boolean[0]; @Nonnull private InputStreamSource<LongInputStream> secondsStreamSource = missingStreamSource(LongInputStream.class); @Nullable private LongInputStream secondsStream; @Nonnull private InputStreamSource<LongInputStream> nanosStreamSource = missingStreamSource(LongInputStream.class); @Nullable private LongInputStream nanosStream; private long[] secondsVector = new long[0]; private long[] nanosVector = new long[0]; private boolean rowGroupOpen; public TimestampStreamReader(StreamDescriptor streamDescriptor, DateTimeZone hiveStorageTimeZone) { this.streamDescriptor = requireNonNull(streamDescriptor, "stream is null"); this.baseTimestampInSeconds = new DateTime(2015, 1, 1, 0, 0, requireNonNull(hiveStorageTimeZone, "hiveStorageTimeZone is null")).getMillis() / MILLIS_PER_SECOND; } @Override public void prepareNextRead(int batchSize) { readOffset += nextBatchSize; nextBatchSize = batchSize; } @Override public Block readBlock(Type type) throws IOException { if (!rowGroupOpen) { openRowGroup(); } if (readOffset > 0) { if (presentStream != null) { // skip ahead the present bit reader, but count the set bits // and use this as the skip size for the data reader readOffset = presentStream.countBitsSet(readOffset); } if (readOffset > 0) { if (secondsStream == null) { throw new OrcCorruptionException(streamDescriptor.getOrcDataSourceId(), "Value is not null but seconds stream is not present"); } if (nanosStream == null) { throw new OrcCorruptionException(streamDescriptor.getOrcDataSourceId(), "Value is not null but nanos stream is not present"); } secondsStream.skip(readOffset); nanosStream.skip(readOffset); } } if (secondsVector.length < nextBatchSize) { secondsVector = new long[nextBatchSize]; } if (nanosVector.length < nextBatchSize) { nanosVector = new long[nextBatchSize]; } BlockBuilder builder = type.createBlockBuilder(new BlockBuilderStatus(), nextBatchSize); if (presentStream == null) { if (secondsStream == null) { throw new OrcCorruptionException(streamDescriptor.getOrcDataSourceId(), "Value is not null but seconds stream is not present"); } if (nanosStream == null) { throw new OrcCorruptionException(streamDescriptor.getOrcDataSourceId(), "Value is not null but nanos stream is not present"); } secondsStream.nextLongVector(nextBatchSize, secondsVector); nanosStream.nextLongVector(nextBatchSize, nanosVector); // merge seconds and nanos together for (int i = 0; i < nextBatchSize; i++) { type.writeLong(builder, decodeTimestamp(secondsVector[i], nanosVector[i], baseTimestampInSeconds)); } } else { if (nullVector.length < nextBatchSize) { nullVector = new boolean[nextBatchSize]; } int nullValues = presentStream.getUnsetBits(nextBatchSize, nullVector); if (nullValues != nextBatchSize) { if (secondsStream == null) { throw new OrcCorruptionException(streamDescriptor.getOrcDataSourceId(), "Value is not null but seconds stream is not present"); } if (nanosStream == null) { throw new OrcCorruptionException(streamDescriptor.getOrcDataSourceId(), "Value is not null but nanos stream is not present"); } secondsStream.nextLongVector(nextBatchSize, secondsVector, nullVector); nanosStream.nextLongVector(nextBatchSize, nanosVector, nullVector); // merge seconds and nanos together for (int i = 0; i < nextBatchSize; i++) { if (nullVector[i]) { builder.appendNull(); } else { type.writeLong(builder, decodeTimestamp(secondsVector[i], nanosVector[i], baseTimestampInSeconds)); } } } else { for (int i = 0; i < nextBatchSize; i++) { builder.appendNull(); } } } readOffset = 0; nextBatchSize = 0; return builder.build(); } private void openRowGroup() throws IOException { presentStream = presentStreamSource.openStream(); secondsStream = secondsStreamSource.openStream(); nanosStream = nanosStreamSource.openStream(); rowGroupOpen = true; } @Override public void startStripe(InputStreamSources dictionaryStreamSources, List<ColumnEncoding> encoding) throws IOException { presentStreamSource = missingStreamSource(BooleanInputStream.class); secondsStreamSource = missingStreamSource(LongInputStream.class); nanosStreamSource = missingStreamSource(LongInputStream.class); readOffset = 0; nextBatchSize = 0; presentStream = null; secondsStream = null; nanosStream = null; rowGroupOpen = false; } @Override public void startRowGroup(InputStreamSources dataStreamSources) throws IOException { presentStreamSource = dataStreamSources.getInputStreamSource(streamDescriptor, PRESENT, BooleanInputStream.class); secondsStreamSource = dataStreamSources.getInputStreamSource(streamDescriptor, DATA, LongInputStream.class); nanosStreamSource = dataStreamSources.getInputStreamSource(streamDescriptor, SECONDARY, LongInputStream.class); readOffset = 0; nextBatchSize = 0; presentStream = null; secondsStream = null; nanosStream = null; rowGroupOpen = false; } @Override public String toString() { return toStringHelper(this) .addValue(streamDescriptor) .toString(); } // This comes from the Apache Hive ORC code public static long decodeTimestamp(long seconds, long serializedNanos, long baseTimestampInSeconds) { long millis = (seconds + baseTimestampInSeconds) * MILLIS_PER_SECOND; long nanos = parseNanos(serializedNanos); // the rounding error exists because java always rounds up when dividing integers // -42001/1000 = -42; and -42001 % 1000 = -1 (+ 1000) // to get the correct value we need // (-42 - 1)*1000 + 999 = -42001 // (42)*1000 + 1 = 42001 if (millis < 0 && nanos != 0) { millis -= 1000; } // Truncate nanos to millis and add to mills return millis + (nanos / 1_000_000); } // This comes from the Apache Hive ORC code private static int parseNanos(long serialized) { int zeros = ((int) serialized) & 0b111; int result = (int) (serialized >>> 3); if (zeros != 0) { for (int i = 0; i <= zeros; ++i) { result *= 10; } } return result; } }