/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hive.ql.io.orc.encoded; import org.apache.orc.impl.RunLengthByteReader; import java.io.IOException; import java.util.List; import org.apache.hadoop.hive.common.io.encoded.EncodedColumnBatch; import org.apache.hadoop.hive.common.io.encoded.EncodedColumnBatch.ColumnStreamData; import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; import org.apache.hadoop.hive.ql.io.orc.encoded.Reader.OrcEncodedColumnBatch; import org.apache.orc.CompressionCodec; import org.apache.orc.TypeDescription; import org.apache.orc.TypeDescription.Category; import org.apache.orc.impl.PositionProvider; import org.apache.orc.impl.SettableUncompressedStream; import org.apache.orc.impl.TreeReaderFactory; import org.apache.orc.OrcProto; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class EncodedTreeReaderFactory extends TreeReaderFactory { private static final Logger LOG = LoggerFactory.getLogger(EncodedTreeReaderFactory.class); /** * We choose to use a toy programming language, so we cannot use multiple inheritance. * If we could, we could have this inherit TreeReader to contain the common impl, and then * have e.g. SettableIntTreeReader inherit both Settable... and Int.. 
TreeReader-s.
   * Instead, we have a settable interface that the caller will cast to and call setBuffers.
   */
  public interface SettableTreeReader {
    /**
     * Points the reader's underlying streams at the data of a new batch.
     * @param batch encoded data for this reader's column, indexed by stream kind.
     * @param sameStripe whether the batch comes from the same stripe as the previous one
     *        (stripe-scoped streams such as dictionaries may then be left as-is).
     */
    void setBuffers(EncodedColumnBatch<OrcBatchKey> batch, boolean sameStripe) throws IOException;
  }

  /**
   * Timestamp reader over settable streams: PRESENT, DATA (seconds), SECONDARY (nanos).
   * Can alternatively serve pre-read vectors (SerDe reader case) instead of the streams.
   */
  public static class TimestampStreamReader extends TimestampTreeReader
      implements SettableTreeReader {
    private boolean isFileCompressed;
    private SettableUncompressedStream _presentStream;
    private SettableUncompressedStream _secondsStream;
    private SettableUncompressedStream _nanosStream;
    // When non-null, values come from these pre-read vectors rather than the streams;
    // reset to null once the last vector has been consumed.
    private List<ColumnVector> vectors;
    private int vectorIndex = 0;

    private TimestampStreamReader(int columnId, SettableUncompressedStream present,
        SettableUncompressedStream data, SettableUncompressedStream nanos,
        boolean isFileCompressed, OrcProto.ColumnEncoding encoding,
        TreeReaderFactory.Context context, List<ColumnVector> vectors) throws IOException {
      super(columnId, present, data, nanos, encoding, context);
      this.isFileCompressed = isFileCompressed;
      this._presentStream = present;
      this._secondsStream = data;
      this._nanosStream = nanos;
      this.vectors = vectors;
    }

    @Override
    public void nextVector(
        ColumnVector previousVector, boolean[] isNull, int batchSize) throws IOException {
      if (vectors == null) {
        super.nextVector(previousVector, isNull, batchSize);
        return;
      }
      // Note: we assume that batchSize will be consistent with vectors passed in.
      // This is rather brittle; same in other readers.
      vectors.get(vectorIndex++).shallowCopyTo(previousVector);
      if (vectorIndex == vectors.size()) {
        vectors = null;
      }
    }

    @Override
    public void seek(PositionProvider index) throws IOException {
      // Vector-based reads have no streams to seek (streams are not created in that case).
      if (vectors != null) return;
      if (present != null) {
        if (isFileCompressed) {
          // Consume the compressed-chunk offset entry; these streams are pre-uncompressed.
          index.getNext();
        }
        present.seek(index);
      }
      // data stream could be empty stream or already reached end of stream before present stream.
      // This can happen if all values in stream are nulls or last row group values are all null.
      if (_secondsStream.available() > 0) {
        if (isFileCompressed) {
          index.getNext();
        }
        data.seek(index);
      }
      if (_nanosStream.available() > 0) {
        if (isFileCompressed) {
          index.getNext();
        }
        nanos.seek(index);
      }
    }

    @Override
    public void setBuffers(EncodedColumnBatch<OrcBatchKey> batch, boolean sameStripe)
        throws IOException {
      // The situation here and in other readers is currently as such - setBuffers is never called
      // in SerDe reader case, and SerDe reader case is the only one that uses vector-s.
      // When the readers are created with vectors, streams are actually not created at all.
      // So, if we could have a set of vectors, then set of buffers, we'd be in trouble here;
      // we may need to implement that if this scenario is ever supported.
      assert vectors == null;
      ColumnStreamData[] streamsData = batch.getColumnData(columnId);
      if (_presentStream != null) {
        _presentStream.setBuffers(
            StreamUtils.createDiskRangeInfo(streamsData[OrcProto.Stream.Kind.PRESENT_VALUE]));
      }
      if (_secondsStream != null) {
        _secondsStream.setBuffers(
            StreamUtils.createDiskRangeInfo(streamsData[OrcProto.Stream.Kind.DATA_VALUE]));
      }
      if (_nanosStream != null) {
        _nanosStream.setBuffers(
            StreamUtils.createDiskRangeInfo(streamsData[OrcProto.Stream.Kind.SECONDARY_VALUE]));
      }
    }

    /** Re-derives the base timestamp when the writer timezone changes between stripes. */
    public void updateTimezone(String writerTimezoneId) throws IOException {
      base_timestamp = getBaseTimestamp(writerTimezoneId);
    }

    /** Fluent builder for {@link TimestampStreamReader}. */
    public static class StreamReaderBuilder {
      private int columnIndex;
      private ColumnStreamData presentStream;
      private ColumnStreamData dataStream;
      private ColumnStreamData nanosStream;
      private CompressionCodec compressionCodec;
      private OrcProto.ColumnEncoding columnEncoding;
      private TreeReaderFactory.Context context;
      private List<ColumnVector> vectors;

      public StreamReaderBuilder setColumnIndex(int columnIndex) {
        this.columnIndex = columnIndex;
        return this;
      }

      public StreamReaderBuilder setPresentStream(ColumnStreamData presentStream) {
        this.presentStream = presentStream;
        return this;
      }

      public StreamReaderBuilder setSecondsStream(ColumnStreamData dataStream) {
        this.dataStream = dataStream;
        return this;
      }

      public StreamReaderBuilder setNanosStream(ColumnStreamData secondaryStream) {
        this.nanosStream = secondaryStream;
        return this;
      }

      public StreamReaderBuilder setCompressionCodec(CompressionCodec compressionCodec) {
        this.compressionCodec = compressionCodec;
        return this;
      }

      public StreamReaderBuilder setColumnEncoding(OrcProto.ColumnEncoding encoding) {
        this.columnEncoding = encoding;
        return this;
      }

      public StreamReaderBuilder setContext(TreeReaderFactory.Context context) {
        this.context = context;
        return this;
      }

      public TimestampStreamReader build() throws IOException {
        SettableUncompressedStream present = StreamUtils
            .createSettableUncompressedStream(OrcProto.Stream.Kind.PRESENT.name(), presentStream);
        SettableUncompressedStream data = StreamUtils
            .createSettableUncompressedStream(OrcProto.Stream.Kind.DATA.name(), dataStream);
        SettableUncompressedStream nanos = StreamUtils
            .createSettableUncompressedStream(OrcProto.Stream.Kind.SECONDARY.name(), nanosStream);
        // Note: compression only toggles the extra position entry in seek();
        // the stream contents themselves are always uncompressed here.
        boolean isFileCompressed = compressionCodec != null;
        return new TimestampStreamReader(columnIndex, present, data, nanos,
            isFileCompressed, columnEncoding, context, vectors);
      }

      public StreamReaderBuilder setVectors(List<ColumnVector> vectors) {
        this.vectors = vectors;
        return this;
      }
    }

    public static StreamReaderBuilder builder() {
      return new StreamReaderBuilder();
    }
  }

  /** String reader over settable streams; supports both DIRECT and DICTIONARY encodings. */
  protected static class StringStreamReader extends StringTreeReader
      implements SettableTreeReader {
    private boolean _isFileCompressed;
    private boolean _isDictionaryEncoding;
    private SettableUncompressedStream _presentStream;
    private SettableUncompressedStream _dataStream;
    private SettableUncompressedStream _lengthStream;
    private SettableUncompressedStream _dictionaryStream;
    // See TimestampStreamReader.vectors for the semantics of this field.
    private List<ColumnVector> vectors;
    private int vectorIndex = 0;

    private StringStreamReader(int columnId, SettableUncompressedStream present,
        SettableUncompressedStream data,
SettableUncompressedStream length, SettableUncompressedStream dictionary,
        boolean isFileCompressed, OrcProto.ColumnEncoding encoding,
        TreeReaderFactory.Context context, List<ColumnVector> vectors) throws IOException {
      super(columnId, present, data, length, dictionary, encoding, context);
      // Dictionary stream presence is how the encoding was chosen by the caller.
      this._isDictionaryEncoding = dictionary != null;
      this._isFileCompressed = isFileCompressed;
      this._presentStream = present;
      this._dataStream = data;
      this._lengthStream = length;
      this._dictionaryStream = dictionary;
      this.vectors = vectors;
    }

    @Override
    public void seek(PositionProvider[] index) throws IOException {
      // This string reader should simply redirect to its own seek (what other types already do).
      this.seek(index[columnId]);
    }

    @Override
    public void seek(PositionProvider index) throws IOException {
      // Vector-based reads have no streams to seek.
      if (vectors != null) return;
      if (present != null) {
        if (_isFileCompressed) {
          // Consume the compressed-chunk offset entry; these streams are pre-uncompressed.
          index.getNext();
        }
        reader.getPresent().seek(index);
      }
      if (_isDictionaryEncoding) {
        // DICTIONARY encoding
        // data stream could be empty stream or already reached end of stream before present stream.
        // This can happen if all values in stream are nulls or last row group values are all null.
        if (_dataStream != null && _dataStream.available() > 0) {
          if (_isFileCompressed) {
            index.getNext();
          }
          ((StringDictionaryTreeReader) reader).getReader().seek(index);
        }
      } else {
        // DIRECT encoding
        // data stream could be empty stream or already reached end of stream before present stream.
        // This can happen if all values in stream are nulls or last row group values are all null.
        if (_dataStream != null && _dataStream.available() > 0) {
          if (_isFileCompressed) {
            index.getNext();
          }
          ((StringDirectTreeReader) reader).getStream().seek(index);
        }
        if (_lengthStream != null && _lengthStream.available() > 0) {
          if (_isFileCompressed) {
            index.getNext();
          }
          ((StringDirectTreeReader) reader).getLengths().seek(index);
        }
      }
    }

    @Override
    public void nextVector(
        ColumnVector previousVector, boolean[] isNull, int batchSize) throws IOException {
      if (vectors == null) {
        super.nextVector(previousVector, isNull, batchSize);
        return;
      }
      // Serve the next pre-read vector; see the batchSize note in TimestampStreamReader.
      vectors.get(vectorIndex++).shallowCopyTo(previousVector);
      if (vectorIndex == vectors.size()) {
        vectors = null;
      }
    }

    @Override
    public void setBuffers(EncodedColumnBatch<OrcBatchKey> batch, boolean sameStripe)
        throws IOException {
      assert vectors == null; // See the comment in TimestampStreamReader.setBuffers.
      ColumnStreamData[] streamsData = batch.getColumnData(columnId);
      if (_presentStream != null) {
        _presentStream.setBuffers(
            StreamUtils.createDiskRangeInfo(streamsData[OrcProto.Stream.Kind.PRESENT_VALUE]));
      }
      if (_dataStream != null) {
        _dataStream.setBuffers(
            StreamUtils.createDiskRangeInfo(streamsData[OrcProto.Stream.Kind.DATA_VALUE]));
      }
      // DIRECT encoding: lengths are per-row-group data, so always refresh them.
      if (!_isDictionaryEncoding) {
        if (_lengthStream != null) {
          _lengthStream.setBuffers(
              StreamUtils.createDiskRangeInfo(streamsData[OrcProto.Stream.Kind.LENGTH_VALUE]));
        }
      }
      // set these streams only if the stripe is different
      if (!sameStripe && _isDictionaryEncoding) {
        if (_lengthStream != null) {
          _lengthStream.setBuffers(
              StreamUtils.createDiskRangeInfo(streamsData[OrcProto.Stream.Kind.LENGTH_VALUE]));
        }
        if (_dictionaryStream != null) {
          _dictionaryStream.setBuffers(StreamUtils.createDiskRangeInfo(
              streamsData[OrcProto.Stream.Kind.DICTIONARY_DATA_VALUE]));
        }
      }
    }

    /** Fluent builder for {@link StringStreamReader}. */
    public static class StreamReaderBuilder {
      private int columnIndex;
      private ColumnStreamData presentStream;
      private ColumnStreamData dataStream;
      private ColumnStreamData dictionaryStream;
      private ColumnStreamData lengthStream;
      private CompressionCodec compressionCodec;
      private OrcProto.ColumnEncoding columnEncoding;
      private List<ColumnVector> vectors;
      private TreeReaderFactory.Context context;

      public StreamReaderBuilder setColumnIndex(int columnIndex) {
        this.columnIndex = columnIndex;
        return this;
      }

      public StreamReaderBuilder setPresentStream(ColumnStreamData presentStream) {
        this.presentStream = presentStream;
        return this;
      }

      public StreamReaderBuilder setDataStream(ColumnStreamData dataStream) {
        this.dataStream = dataStream;
        return this;
      }

      public StreamReaderBuilder setLengthStream(ColumnStreamData lengthStream) {
        this.lengthStream = lengthStream;
        return this;
      }

      public StreamReaderBuilder setDictionaryStream(ColumnStreamData dictStream) {
        this.dictionaryStream = dictStream;
        return this;
      }

      public StreamReaderBuilder setCompressionCodec(CompressionCodec compressionCodec) {
        this.compressionCodec = compressionCodec;
        return this;
      }

      public StreamReaderBuilder setColumnEncoding(OrcProto.ColumnEncoding encoding) {
        this.columnEncoding = encoding;
        return this;
      }

      public StreamReaderBuilder setContext(TreeReaderFactory.Context context) {
        this.context = context;
        return this;
      }

      public StringStreamReader build() throws IOException {
        SettableUncompressedStream present = StreamUtils
            .createSettableUncompressedStream(OrcProto.Stream.Kind.PRESENT.name(), presentStream);
        SettableUncompressedStream data = StreamUtils
            .createSettableUncompressedStream(OrcProto.Stream.Kind.DATA.name(), dataStream);
        SettableUncompressedStream length = StreamUtils
            .createSettableUncompressedStream(OrcProto.Stream.Kind.LENGTH.name(), lengthStream);
        SettableUncompressedStream dictionary = StreamUtils.createSettableUncompressedStream(
            OrcProto.Stream.Kind.DICTIONARY_DATA.name(), dictionaryStream);
        boolean isFileCompressed = compressionCodec != null;
        return new StringStreamReader(columnIndex, present, data, length, dictionary,
            isFileCompressed, columnEncoding, context, vectors);
      }

      public StreamReaderBuilder setVectors(List<ColumnVector> vectors) {
        this.vectors = vectors;
        return this;
      }
    }

    public static StreamReaderBuilder builder() {
      return new StreamReaderBuilder();
    }
  }

  /** Short reader over settable streams (PRESENT, DATA). */
  protected static class ShortStreamReader extends ShortTreeReader
      implements SettableTreeReader {
    private boolean isFileCompressed;
    private SettableUncompressedStream _presentStream;
    private SettableUncompressedStream _dataStream;
    // See TimestampStreamReader.vectors for the semantics of this field.
    private List<ColumnVector> vectors;
    private int vectorIndex = 0;

    private ShortStreamReader(int columnId, SettableUncompressedStream present,
        SettableUncompressedStream data, boolean isFileCompressed,
        OrcProto.ColumnEncoding encoding, TreeReaderFactory.Context context,
        List<ColumnVector> vectors) throws IOException {
      super(columnId, present, data, encoding, context);
      this.isFileCompressed = isFileCompressed;
      this._presentStream = present;
      this._dataStream = data;
      this.vectors = vectors;
    }

    @Override
    public void seek(PositionProvider index) throws IOException {
      // Vector-based reads have no streams to seek.
      if (vectors != null) return;
      if (present != null) {
        if (isFileCompressed) {
          // Consume the compressed-chunk offset entry; these streams are pre-uncompressed.
          index.getNext();
        }
        present.seek(index);
      }
      // data stream could be empty stream or already reached end of stream before present stream.
      // This can happen if all values in stream are nulls or last row group values are all null.
      // NOTE(review): unlike StringStreamReader, _dataStream is not null-checked here —
      // presumably DATA always exists for this type; verify against the callers.
      if (_dataStream.available() > 0) {
        if (isFileCompressed) {
          index.getNext();
        }
        reader.seek(index);
      }
    }

    @Override
    public void nextVector(
        ColumnVector previousVector, boolean[] isNull, int batchSize) throws IOException {
      if (vectors == null) {
        super.nextVector(previousVector, isNull, batchSize);
        return;
      }
      // Serve the next pre-read vector; see the batchSize note in TimestampStreamReader.
      vectors.get(vectorIndex++).shallowCopyTo(previousVector);
      if (vectorIndex == vectors.size()) {
        vectors = null;
      }
    }

    @Override
    public void setBuffers(EncodedColumnBatch<OrcBatchKey> batch, boolean sameStripe)
        throws IOException {
      assert vectors == null; // See the comment in TimestampStreamReader.setBuffers.
ColumnStreamData[] streamsData = batch.getColumnData(columnId);
      if (_presentStream != null) {
        _presentStream.setBuffers(
            StreamUtils.createDiskRangeInfo(streamsData[OrcProto.Stream.Kind.PRESENT_VALUE]));
      }
      if (_dataStream != null) {
        _dataStream.setBuffers(
            StreamUtils.createDiskRangeInfo(streamsData[OrcProto.Stream.Kind.DATA_VALUE]));
      }
    }

    /** Fluent builder for {@link ShortStreamReader}. */
    public static class StreamReaderBuilder {
      private int columnIndex;
      private ColumnStreamData presentStream;
      private ColumnStreamData dataStream;
      private CompressionCodec compressionCodec;
      private OrcProto.ColumnEncoding columnEncoding;
      private List<ColumnVector> vectors;
      private TreeReaderFactory.Context context;

      public StreamReaderBuilder setColumnIndex(int columnIndex) {
        this.columnIndex = columnIndex;
        return this;
      }

      public StreamReaderBuilder setPresentStream(ColumnStreamData presentStream) {
        this.presentStream = presentStream;
        return this;
      }

      public StreamReaderBuilder setDataStream(ColumnStreamData dataStream) {
        this.dataStream = dataStream;
        return this;
      }

      public StreamReaderBuilder setCompressionCodec(CompressionCodec compressionCodec) {
        this.compressionCodec = compressionCodec;
        return this;
      }

      public StreamReaderBuilder setColumnEncoding(OrcProto.ColumnEncoding encoding) {
        this.columnEncoding = encoding;
        return this;
      }

      public StreamReaderBuilder setContext(TreeReaderFactory.Context context) {
        this.context = context;
        return this;
      }

      public ShortStreamReader build() throws IOException {
        SettableUncompressedStream present = StreamUtils
            .createSettableUncompressedStream(OrcProto.Stream.Kind.PRESENT.name(), presentStream);
        SettableUncompressedStream data = StreamUtils
            .createSettableUncompressedStream(OrcProto.Stream.Kind.DATA.name(), dataStream);
        boolean isFileCompressed = compressionCodec != null;
        return new ShortStreamReader(columnIndex, present, data, isFileCompressed,
            columnEncoding, context, vectors);
      }

      public StreamReaderBuilder setVectors(List<ColumnVector> vectors) {
        this.vectors = vectors;
        return this;
      }
    }

    public static StreamReaderBuilder builder() {
      return new StreamReaderBuilder();
    }
  }

  /** Long reader over settable streams (PRESENT, DATA). */
  protected static class LongStreamReader extends LongTreeReader
      implements SettableTreeReader {
    private boolean _isFileCompressed;
    private SettableUncompressedStream _presentStream;
    private SettableUncompressedStream _dataStream;
    // See TimestampStreamReader.vectors for the semantics of this field.
    private List<ColumnVector> vectors;
    private int vectorIndex = 0;

    private LongStreamReader(int columnId, SettableUncompressedStream present,
        SettableUncompressedStream data, boolean isFileCompressed,
        OrcProto.ColumnEncoding encoding, TreeReaderFactory.Context context,
        List<ColumnVector> vectors) throws IOException {
      super(columnId, present, data, encoding, context);
      this._isFileCompressed = isFileCompressed;
      this._presentStream = present;
      this._dataStream = data;
      this.vectors = vectors;
    }

    @Override
    public void seek(PositionProvider index) throws IOException {
      // Vector-based reads have no streams to seek.
      if (vectors != null) return;
      if (present != null) {
        if (_isFileCompressed) {
          // Consume the compressed-chunk offset entry; these streams are pre-uncompressed.
          index.getNext();
        }
        present.seek(index);
      }
      // data stream could be empty stream or already reached end of stream before present stream.
      // This can happen if all values in stream are nulls or last row group values are all null.
      if (_dataStream.available() > 0) {
        if (_isFileCompressed) {
          index.getNext();
        }
        reader.seek(index);
      }
    }

    @Override
    public void nextVector(
        ColumnVector previousVector, boolean[] isNull, int batchSize) throws IOException {
      if (vectors == null) {
        super.nextVector(previousVector, isNull, batchSize);
        return;
      }
      // Serve the next pre-read vector; see the batchSize note in TimestampStreamReader.
      vectors.get(vectorIndex++).shallowCopyTo(previousVector);
      if (vectorIndex == vectors.size()) {
        vectors = null;
      }
    }

    @Override
    public void setBuffers(EncodedColumnBatch<OrcBatchKey> batch, boolean sameStripe)
        throws IOException {
      assert vectors == null; // See the comment in TimestampStreamReader.setBuffers.
ColumnStreamData[] streamsData = batch.getColumnData(columnId);
      if (_presentStream != null) {
        _presentStream.setBuffers(
            StreamUtils.createDiskRangeInfo(streamsData[OrcProto.Stream.Kind.PRESENT_VALUE]));
      }
      if (_dataStream != null) {
        _dataStream.setBuffers(
            StreamUtils.createDiskRangeInfo(streamsData[OrcProto.Stream.Kind.DATA_VALUE]));
      }
    }

    /** Fluent builder for {@link LongStreamReader}. */
    public static class StreamReaderBuilder {
      private int columnIndex;
      private ColumnStreamData presentStream;
      private ColumnStreamData dataStream;
      private CompressionCodec compressionCodec;
      private OrcProto.ColumnEncoding columnEncoding;
      private TreeReaderFactory.Context context;
      private List<ColumnVector> vectors;

      public StreamReaderBuilder setColumnIndex(int columnIndex) {
        this.columnIndex = columnIndex;
        return this;
      }

      public StreamReaderBuilder setPresentStream(ColumnStreamData presentStream) {
        this.presentStream = presentStream;
        return this;
      }

      public StreamReaderBuilder setDataStream(ColumnStreamData dataStream) {
        this.dataStream = dataStream;
        return this;
      }

      public StreamReaderBuilder setCompressionCodec(CompressionCodec compressionCodec) {
        this.compressionCodec = compressionCodec;
        return this;
      }

      public StreamReaderBuilder setColumnEncoding(OrcProto.ColumnEncoding encoding) {
        this.columnEncoding = encoding;
        return this;
      }

      public StreamReaderBuilder setContext(TreeReaderFactory.Context context) {
        this.context = context;
        return this;
      }

      public LongStreamReader build() throws IOException {
        SettableUncompressedStream present = StreamUtils
            .createSettableUncompressedStream(OrcProto.Stream.Kind.PRESENT.name(), presentStream);
        SettableUncompressedStream data = StreamUtils
            .createSettableUncompressedStream(OrcProto.Stream.Kind.DATA.name(), dataStream);
        boolean isFileCompressed = compressionCodec != null;
        return new LongStreamReader(columnIndex, present, data, isFileCompressed,
            columnEncoding, context, vectors);
      }

      public StreamReaderBuilder setVectors(List<ColumnVector> vectors) {
        this.vectors = vectors;
        return this;
      }
    }

    public static StreamReaderBuilder builder() {
      return new StreamReaderBuilder();
    }
  }

  /** Int reader over settable streams (PRESENT, DATA). */
  protected static class IntStreamReader extends IntTreeReader
      implements SettableTreeReader {
    private boolean _isFileCompressed;
    private SettableUncompressedStream _presentStream;
    private SettableUncompressedStream _dataStream;
    // See TimestampStreamReader.vectors for the semantics of this field.
    private List<ColumnVector> vectors;
    private int vectorIndex = 0;

    private IntStreamReader(int columnId, SettableUncompressedStream present,
        SettableUncompressedStream data, boolean isFileCompressed,
        OrcProto.ColumnEncoding encoding, TreeReaderFactory.Context context,
        List<ColumnVector> vectors) throws IOException {
      super(columnId, present, data, encoding, context);
      this._isFileCompressed = isFileCompressed;
      this._dataStream = data;
      this._presentStream = present;
      this.vectors = vectors;
    }

    @Override
    public void seek(PositionProvider index) throws IOException {
      // Vector-based reads have no streams to seek.
      if (vectors != null) return;
      if (present != null) {
        if (_isFileCompressed) {
          // Consume the compressed-chunk offset entry; these streams are pre-uncompressed.
          index.getNext();
        }
        present.seek(index);
      }
      // data stream could be empty stream or already reached end of stream before present stream.
      // This can happen if all values in stream are nulls or last row group values are all null.
      if (_dataStream.available() > 0) {
        if (_isFileCompressed) {
          index.getNext();
        }
        reader.seek(index);
      }
    }

    @Override
    public void nextVector(
        ColumnVector previousVector, boolean[] isNull, int batchSize) throws IOException {
      if (vectors == null) {
        super.nextVector(previousVector, isNull, batchSize);
        return;
      }
      // Serve the next pre-read vector; see the batchSize note in TimestampStreamReader.
      vectors.get(vectorIndex++).shallowCopyTo(previousVector);
      if (vectorIndex == vectors.size()) {
        vectors = null;
      }
    }

    @Override
    public void setBuffers(EncodedColumnBatch<OrcBatchKey> batch, boolean sameStripe)
        throws IOException {
      assert vectors == null; // See the comment in TimestampStreamReader.setBuffers.
ColumnStreamData[] streamsData = batch.getColumnData(columnId);
      if (_presentStream != null) {
        _presentStream.setBuffers(
            StreamUtils.createDiskRangeInfo(streamsData[OrcProto.Stream.Kind.PRESENT_VALUE]));
      }
      if (_dataStream != null) {
        _dataStream.setBuffers(
            StreamUtils.createDiskRangeInfo(streamsData[OrcProto.Stream.Kind.DATA_VALUE]));
      }
    }

    /** Fluent builder for {@link IntStreamReader}. */
    public static class StreamReaderBuilder {
      private int columnIndex;
      private ColumnStreamData presentStream;
      private ColumnStreamData dataStream;
      private CompressionCodec compressionCodec;
      private OrcProto.ColumnEncoding columnEncoding;
      private List<ColumnVector> vectors;
      private TreeReaderFactory.Context context;

      public StreamReaderBuilder setColumnIndex(int columnIndex) {
        this.columnIndex = columnIndex;
        return this;
      }

      public StreamReaderBuilder setPresentStream(ColumnStreamData presentStream) {
        this.presentStream = presentStream;
        return this;
      }

      public StreamReaderBuilder setDataStream(ColumnStreamData dataStream) {
        this.dataStream = dataStream;
        return this;
      }

      public StreamReaderBuilder setCompressionCodec(CompressionCodec compressionCodec) {
        this.compressionCodec = compressionCodec;
        return this;
      }

      public StreamReaderBuilder setColumnEncoding(OrcProto.ColumnEncoding encoding) {
        this.columnEncoding = encoding;
        return this;
      }

      public StreamReaderBuilder setContext(TreeReaderFactory.Context context) {
        this.context = context;
        return this;
      }

      public IntStreamReader build() throws IOException {
        SettableUncompressedStream present = StreamUtils
            .createSettableUncompressedStream(OrcProto.Stream.Kind.PRESENT.name(), presentStream);
        SettableUncompressedStream data = StreamUtils
            .createSettableUncompressedStream(OrcProto.Stream.Kind.DATA.name(), dataStream);
        boolean isFileCompressed = compressionCodec != null;
        return new IntStreamReader(columnIndex, present, data, isFileCompressed,
            columnEncoding, context, vectors);
      }

      public StreamReaderBuilder setVectors(List<ColumnVector> vectors) {
        this.vectors = vectors;
        return this;
      }
    }

    public static StreamReaderBuilder builder() {
      return new StreamReaderBuilder();
    }
  }

  /** Float reader over settable streams (PRESENT, DATA); no encoding/context needed. */
  protected static class FloatStreamReader extends FloatTreeReader
      implements SettableTreeReader {
    private boolean _isFileCompressed;
    private SettableUncompressedStream _presentStream;
    private SettableUncompressedStream _dataStream;
    // See TimestampStreamReader.vectors for the semantics of this field.
    private List<ColumnVector> vectors;
    private int vectorIndex = 0;

    private FloatStreamReader(int columnId, SettableUncompressedStream present,
        SettableUncompressedStream data, boolean isFileCompressed,
        List<ColumnVector> vectors) throws IOException {
      super(columnId, present, data);
      this._isFileCompressed = isFileCompressed;
      this._presentStream = present;
      this._dataStream = data;
      this.vectors = vectors;
    }

    @Override
    public void seek(PositionProvider index) throws IOException {
      // Vector-based reads have no streams to seek.
      if (vectors != null) return;
      if (present != null) {
        if (_isFileCompressed) {
          // Consume the compressed-chunk offset entry; these streams are pre-uncompressed.
          index.getNext();
        }
        present.seek(index);
      }
      // data stream could be empty stream or already reached end of stream before present stream.
      // This can happen if all values in stream are nulls or last row group values are all null.
      if (_dataStream.available() > 0) {
        if (_isFileCompressed) {
          index.getNext();
        }
        stream.seek(index);
      }
    }

    @Override
    public void nextVector(
        ColumnVector previousVector, boolean[] isNull, int batchSize) throws IOException {
      if (vectors == null) {
        super.nextVector(previousVector, isNull, batchSize);
        return;
      }
      // Serve the next pre-read vector; see the batchSize note in TimestampStreamReader.
      vectors.get(vectorIndex++).shallowCopyTo(previousVector);
      if (vectorIndex == vectors.size()) {
        vectors = null;
      }
    }

    @Override
    public void setBuffers(EncodedColumnBatch<OrcBatchKey> batch, boolean sameStripe)
        throws IOException {
      assert vectors == null; // See the comment in TimestampStreamReader.setBuffers.
ColumnStreamData[] streamsData = batch.getColumnData(columnId);
      if (_presentStream != null) {
        _presentStream.setBuffers(
            StreamUtils.createDiskRangeInfo(streamsData[OrcProto.Stream.Kind.PRESENT_VALUE]));
      }
      if (_dataStream != null) {
        _dataStream.setBuffers(
            StreamUtils.createDiskRangeInfo(streamsData[OrcProto.Stream.Kind.DATA_VALUE]));
      }
    }

    /** Fluent builder for {@link FloatStreamReader}; note there is no encoding or context. */
    public static class StreamReaderBuilder {
      private int columnIndex;
      private ColumnStreamData presentStream;
      private ColumnStreamData dataStream;
      private CompressionCodec compressionCodec;
      private List<ColumnVector> vectors;

      public StreamReaderBuilder setColumnIndex(int columnIndex) {
        this.columnIndex = columnIndex;
        return this;
      }

      public StreamReaderBuilder setPresentStream(ColumnStreamData presentStream) {
        this.presentStream = presentStream;
        return this;
      }

      public StreamReaderBuilder setDataStream(ColumnStreamData dataStream) {
        this.dataStream = dataStream;
        return this;
      }

      public StreamReaderBuilder setCompressionCodec(CompressionCodec compressionCodec) {
        this.compressionCodec = compressionCodec;
        return this;
      }

      public FloatStreamReader build() throws IOException {
        SettableUncompressedStream present = StreamUtils
            .createSettableUncompressedStream(OrcProto.Stream.Kind.PRESENT.name(), presentStream);
        SettableUncompressedStream data = StreamUtils
            .createSettableUncompressedStream(OrcProto.Stream.Kind.DATA.name(), dataStream);
        boolean isFileCompressed = compressionCodec != null;
        return new FloatStreamReader(columnIndex, present, data, isFileCompressed, vectors);
      }

      public StreamReaderBuilder setVectors(List<ColumnVector> vectors) {
        this.vectors = vectors;
        return this;
      }
    }

    public static StreamReaderBuilder builder() {
      return new StreamReaderBuilder();
    }
  }

  /** Double reader over settable streams (PRESENT, DATA); no encoding needed. */
  protected static class DoubleStreamReader extends DoubleTreeReader
      implements SettableTreeReader {
    private boolean _isFileCompressed;
    private SettableUncompressedStream _presentStream;
    private SettableUncompressedStream _dataStream;
    // See TimestampStreamReader.vectors for the semantics of this field.
    private List<ColumnVector> vectors;
    private int vectorIndex = 0;

    private DoubleStreamReader(int columnId, SettableUncompressedStream present,
        SettableUncompressedStream data, boolean isFileCompressed,
        List<ColumnVector> vectors) throws IOException {
      super(columnId, present, data);
      this._isFileCompressed = isFileCompressed;
      this._presentStream = present;
      this._dataStream = data;
      this.vectors = vectors;
    }

    @Override
    public void seek(PositionProvider index) throws IOException {
      // Vector-based reads have no streams to seek.
      if (vectors != null) return;
      if (present != null) {
        if (_isFileCompressed) {
          // Consume the compressed-chunk offset entry; these streams are pre-uncompressed.
          index.getNext();
        }
        present.seek(index);
      }
      // data stream could be empty stream or already reached end of stream before present stream.
      // This can happen if all values in stream are nulls or last row group values are all null.
      if (_dataStream.available() > 0) {
        if (_isFileCompressed) {
          index.getNext();
        }
        stream.seek(index);
      }
    }

    @Override
    public void nextVector(
        ColumnVector previousVector, boolean[] isNull, int batchSize) throws IOException {
      if (vectors == null) {
        super.nextVector(previousVector, isNull, batchSize);
        return;
      }
      // Serve the next pre-read vector; see the batchSize note in TimestampStreamReader.
      vectors.get(vectorIndex++).shallowCopyTo(previousVector);
      if (vectorIndex == vectors.size()) {
        vectors = null;
      }
    }

    @Override
    public void setBuffers(EncodedColumnBatch<OrcBatchKey> batch, boolean sameStripe)
        throws IOException {
      assert vectors == null; // See the comment in TimestampStreamReader.setBuffers.
ColumnStreamData[] streamsData = batch.getColumnData(columnId);
      if (_presentStream != null) {
        _presentStream.setBuffers(
            StreamUtils.createDiskRangeInfo(streamsData[OrcProto.Stream.Kind.PRESENT_VALUE]));
      }
      if (_dataStream != null) {
        _dataStream.setBuffers(
            StreamUtils.createDiskRangeInfo(streamsData[OrcProto.Stream.Kind.DATA_VALUE]));
      }
    }

    /** Fluent builder for {@link DoubleStreamReader}. */
    public static class StreamReaderBuilder {
      private int columnIndex;
      private ColumnStreamData presentStream;
      private ColumnStreamData dataStream;
      private CompressionCodec compressionCodec;
      private List<ColumnVector> vectors;
      private TreeReaderFactory.Context context;

      public StreamReaderBuilder setColumnIndex(int columnIndex) {
        this.columnIndex = columnIndex;
        return this;
      }

      public StreamReaderBuilder setPresentStream(ColumnStreamData presentStream) {
        this.presentStream = presentStream;
        return this;
      }

      public StreamReaderBuilder setDataStream(ColumnStreamData dataStream) {
        this.dataStream = dataStream;
        return this;
      }

      public StreamReaderBuilder setCompressionCodec(CompressionCodec compressionCodec) {
        this.compressionCodec = compressionCodec;
        return this;
      }

      public StreamReaderBuilder setContext(TreeReaderFactory.Context context) {
        this.context = context;
        return this;
      }

      public DoubleStreamReader build() throws IOException {
        SettableUncompressedStream present = StreamUtils
            .createSettableUncompressedStream(OrcProto.Stream.Kind.PRESENT.name(), presentStream);
        SettableUncompressedStream data = StreamUtils
            .createSettableUncompressedStream(OrcProto.Stream.Kind.DATA.name(), dataStream);
        boolean isFileCompressed = compressionCodec != null;
        // TODO: why doesn't this use context?
return new DoubleStreamReader(columnIndex, present, data, isFileCompressed, vectors); } public StreamReaderBuilder setVectors(List<ColumnVector> vectors) { this.vectors = vectors; return this; } } public static StreamReaderBuilder builder() { return new StreamReaderBuilder(); } } protected static class DecimalStreamReader extends DecimalTreeReader implements SettableTreeReader { private boolean _isFileCompressed; private SettableUncompressedStream _presentStream; private SettableUncompressedStream _valueStream; private SettableUncompressedStream _scaleStream; private List<ColumnVector> vectors; private int vectorIndex = 0; private DecimalStreamReader(int columnId, int precision, int scale, SettableUncompressedStream presentStream, SettableUncompressedStream valueStream, SettableUncompressedStream scaleStream, boolean isFileCompressed, OrcProto.ColumnEncoding encoding, TreeReaderFactory.Context context, List<ColumnVector> vectors) throws IOException { super(columnId, presentStream, valueStream, scaleStream, encoding, context); this._isFileCompressed = isFileCompressed; this._presentStream = presentStream; this._valueStream = valueStream; this._scaleStream = scaleStream; this.vectors = vectors; } @Override public void seek(PositionProvider index) throws IOException { if (vectors != null) return; if (present != null) { if (_isFileCompressed) { index.getNext(); } present.seek(index); } // data stream could be empty stream or already reached end of stream before present stream. // This can happen if all values in stream are nulls or last row group values are all null. 
// (continuation of DecimalStreamReader.seek) Only seek streams that still have bytes;
// see the empty-stream comment above.
if (_valueStream.available() > 0) {
  if (_isFileCompressed) {
    index.getNext(); // skip compressed-block offset (data is already decompressed)
  }
  valueStream.seek(index);
}
if (_scaleStream.available() > 0) {
  if (_isFileCompressed) {
    index.getNext();
  }
  scaleReader.seek(index);
}
}

@Override
public void nextVector(
    ColumnVector previousVector, boolean[] isNull, int batchSize) throws IOException {
  if (vectors == null) {
    super.nextVector(previousVector, isNull, batchSize);
    return;
  }
  // Cached-vector path; assumes batchSize matches the cached vectors
  // (see the comment in TimestampStreamReader.nextVector).
  vectors.get(vectorIndex++).shallowCopyTo(previousVector);
  if (vectorIndex == vectors.size()) {
    vectors = null;
  }
}

@Override
public void setBuffers(EncodedColumnBatch<OrcBatchKey> batch, boolean sameStripe) throws IOException {
  assert vectors == null; // See the comment in TimestampStreamReader.setBuffers.
  ColumnStreamData[] streamsData = batch.getColumnData(columnId);
  if (_presentStream != null) {
    _presentStream.setBuffers(StreamUtils.createDiskRangeInfo(streamsData[OrcProto.Stream.Kind.PRESENT_VALUE]));
  }
  if (_valueStream != null) {
    _valueStream.setBuffers(StreamUtils.createDiskRangeInfo(streamsData[OrcProto.Stream.Kind.DATA_VALUE]));
  }
  if (_scaleStream != null) {
    // Decimal scale travels in the SECONDARY stream.
    _scaleStream.setBuffers(StreamUtils.createDiskRangeInfo(streamsData[OrcProto.Stream.Kind.SECONDARY_VALUE]));
  }
}

/** Fluent builder assembling a DecimalStreamReader from raw column stream buffers. */
public static class StreamReaderBuilder {
  private int columnIndex;
  private ColumnStreamData presentStream;
  private ColumnStreamData valueStream;
  private ColumnStreamData scaleStream;
  private int scale;
  private int precision;
  private CompressionCodec compressionCodec;
  private OrcProto.ColumnEncoding columnEncoding;
  private List<ColumnVector> vectors;
  private TreeReaderFactory.Context context;

  public StreamReaderBuilder setColumnIndex(int columnIndex) {
    this.columnIndex = columnIndex;
    return this;
  }

  public StreamReaderBuilder setPrecision(int precision) {
    this.precision = precision;
    return this;
  }

  public StreamReaderBuilder setScale(int scale) {
    this.scale = scale;
    return this;
  }

  public StreamReaderBuilder setContext(TreeReaderFactory.Context context) {
    this.context = context;
    return this;
  }

  public StreamReaderBuilder setPresentStream(ColumnStreamData presentStream) {
    this.presentStream = presentStream;
    return this;
  }

  public StreamReaderBuilder setValueStream(ColumnStreamData valueStream) {
    this.valueStream = valueStream;
    return this;
  }

  public StreamReaderBuilder setScaleStream(ColumnStreamData scaleStream) {
    this.scaleStream = scaleStream;
    return this;
  }

  public StreamReaderBuilder setCompressionCodec(CompressionCodec compressionCodec) {
    this.compressionCodec = compressionCodec;
    return this;
  }

  public StreamReaderBuilder setColumnEncoding(OrcProto.ColumnEncoding encoding) {
    this.columnEncoding = encoding;
    return this;
  }

  /** Wraps PRESENT/DATA/SECONDARY buffers in settable streams and constructs the reader. */
  public DecimalStreamReader build() throws IOException {
    SettableUncompressedStream presentInStream = StreamUtils.createSettableUncompressedStream(
        OrcProto.Stream.Kind.PRESENT.name(), presentStream);
    SettableUncompressedStream valueInStream = StreamUtils.createSettableUncompressedStream(
        OrcProto.Stream.Kind.DATA.name(), valueStream);
    SettableUncompressedStream scaleInStream = StreamUtils.createSettableUncompressedStream(
        OrcProto.Stream.Kind.SECONDARY.name(), scaleStream);
    boolean isFileCompressed = compressionCodec != null;
    return new DecimalStreamReader(columnIndex, precision, scale, presentInStream,
        valueInStream, scaleInStream, isFileCompressed, columnEncoding, context, vectors);
  }

  public StreamReaderBuilder setVectors(List<ColumnVector> vectors) {
    this.vectors = vectors;
    return this;
  }
}

public static StreamReaderBuilder builder() {
  return new StreamReaderBuilder();
}
}

/**
 * Date column reader over pre-decompressed (encoded) stripe data, with an optional
 * cached-vector path (see the comment in TimestampStreamReader.nextVector).
 */
protected static class DateStreamReader extends DateTreeReader implements SettableTreeReader {
  private boolean isFileCompressed;
  private SettableUncompressedStream _presentStream;
  private SettableUncompressedStream _dataStream;
  // Pre-materialized vectors when the batch carries vectors instead of stream data.
  private List<ColumnVector> vectors;
  private int vectorIndex = 0;

  private DateStreamReader(int columnId, SettableUncompressedStream present,
      SettableUncompressedStream data, boolean isFileCompressed,
      OrcProto.ColumnEncoding encoding,
      TreeReaderFactory.Context context, List<ColumnVector> vectors) throws IOException {
    super(columnId, present, data, encoding, context);
    this.isFileCompressed = isFileCompressed;
    this._presentStream = present;
    this._dataStream = data;
    this.vectors = vectors;
  }

  @Override
  public void seek(PositionProvider index) throws IOException {
    if (vectors != null) return; // cached vectors need no stream positioning
    if (present != null) {
      if (isFileCompressed) {
        // Skip the compressed-block offset; data here is already decompressed.
        index.getNext();
      }
      present.seek(index);
    }
    // data stream could be empty stream or already reached end of stream before present stream.
    // This can happen if all values in stream are nulls or last row group values are all null.
    if (_dataStream.available() > 0) {
      if (isFileCompressed) {
        index.getNext();
      }
      reader.seek(index);
    }
  }

  @Override
  public void nextVector(
      ColumnVector previousVector, boolean[] isNull, int batchSize) throws IOException {
    if (vectors == null) {
      super.nextVector(previousVector, isNull, batchSize);
      return;
    }
    // Cached-vector path; assumes batchSize matches the cached vectors.
    vectors.get(vectorIndex++).shallowCopyTo(previousVector);
    if (vectorIndex == vectors.size()) {
      vectors = null;
    }
  }

  @Override
  public void setBuffers(EncodedColumnBatch<OrcBatchKey> batch, boolean sameStripe) throws IOException {
    assert vectors == null; // See the comment in TimestampStreamReader.setBuffers.
// (continuation of DateStreamReader.setBuffers) Re-point the settable streams at the
// column's newly arrived data; entries are indexed by stream-kind ordinal.
ColumnStreamData[] streamsData = batch.getColumnData(columnId);
if (_presentStream != null) {
  _presentStream.setBuffers(StreamUtils.createDiskRangeInfo(streamsData[OrcProto.Stream.Kind.PRESENT_VALUE]));
}
if (_dataStream != null) {
  _dataStream.setBuffers(StreamUtils.createDiskRangeInfo(streamsData[OrcProto.Stream.Kind.DATA_VALUE]));
}
}

/** Fluent builder assembling a DateStreamReader from raw column stream buffers. */
public static class StreamReaderBuilder {
  private int columnIndex;
  private ColumnStreamData presentStream;
  private ColumnStreamData dataStream;
  private CompressionCodec compressionCodec;
  private OrcProto.ColumnEncoding columnEncoding;
  private List<ColumnVector> vectors;
  private TreeReaderFactory.Context context;

  public StreamReaderBuilder setColumnIndex(int columnIndex) {
    this.columnIndex = columnIndex;
    return this;
  }

  public StreamReaderBuilder setPresentStream(ColumnStreamData presentStream) {
    this.presentStream = presentStream;
    return this;
  }

  public StreamReaderBuilder setDataStream(ColumnStreamData dataStream) {
    this.dataStream = dataStream;
    return this;
  }

  public StreamReaderBuilder setCompressionCodec(CompressionCodec compressionCodec) {
    this.compressionCodec = compressionCodec;
    return this;
  }

  public StreamReaderBuilder setContext(TreeReaderFactory.Context context) {
    this.context = context;
    return this;
  }

  public StreamReaderBuilder setColumnEncoding(OrcProto.ColumnEncoding encoding) {
    this.columnEncoding = encoding;
    return this;
  }

  public StreamReaderBuilder setVectors(List<ColumnVector> vectors) {
    this.vectors = vectors;
    return this;
  }

  /** Wraps PRESENT/DATA buffers in settable streams and constructs the reader. */
  public DateStreamReader build() throws IOException {
    SettableUncompressedStream present = StreamUtils
        .createSettableUncompressedStream(OrcProto.Stream.Kind.PRESENT.name(), presentStream);
    SettableUncompressedStream data = StreamUtils
        .createSettableUncompressedStream(OrcProto.Stream.Kind.DATA.name(), dataStream);
    boolean isFileCompressed = compressionCodec != null;
    return new DateStreamReader(columnIndex, present, data, isFileCompressed,
        columnEncoding, context, vectors);
  }
}

public static StreamReaderBuilder builder() {
  return new StreamReaderBuilder();
}
}

/**
 * Char column reader over pre-decompressed (encoded) stripe data. Handles both DIRECT
 * (data + lengths) and DICTIONARY (data + lengths + dictionary) encodings; the encoding
 * is inferred from whether a dictionary stream was supplied.
 */
protected static class CharStreamReader extends CharTreeReader implements SettableTreeReader {
  private boolean _isFileCompressed;
  private boolean _isDictionaryEncoding;
  private SettableUncompressedStream _presentStream;
  private SettableUncompressedStream _dataStream;
  private SettableUncompressedStream _lengthStream;
  private SettableUncompressedStream _dictionaryStream;
  // Pre-materialized vectors when the batch carries vectors instead of stream data.
  private List<ColumnVector> vectors;
  private int vectorIndex = 0;

  private CharStreamReader(int columnId, int maxLength,
      SettableUncompressedStream present, SettableUncompressedStream data,
      SettableUncompressedStream length, SettableUncompressedStream dictionary,
      boolean isFileCompressed, OrcProto.ColumnEncoding encoding,
      List<ColumnVector> vectors) throws IOException {
    super(columnId, maxLength, present, data, length, dictionary, encoding);
    this._isDictionaryEncoding = dictionary != null;
    this._isFileCompressed = isFileCompressed;
    this._presentStream = present;
    this._dataStream = data;
    this._lengthStream = length;
    this._dictionaryStream = dictionary;
    this.vectors = vectors;
  }

  @Override
  public void seek(PositionProvider[] index) throws IOException {
    // This string reader should simply redirect to its own seek (what other types already do).
    this.seek(index[columnId]);
  }

  @Override
  public void seek(PositionProvider index) throws IOException {
    if (vectors != null) return; // cached vectors need no stream positioning
    if (present != null) {
      if (_isFileCompressed) {
        // Skip the compressed-block offset; data here is already decompressed.
        index.getNext();
      }
      // The present bitstream is owned by the wrapped string reader.
      reader.getPresent().seek(index);
    }
    if (_isDictionaryEncoding) { // DICTIONARY encoding
      // data stream could be empty stream or already reached end of stream before present stream.
      // This can happen if all values in stream are nulls or last row group values are all null.
if (_dataStream.available() > 0) { if (_isFileCompressed) { index.getNext(); } ((StringDictionaryTreeReader) reader).getReader().seek(index); } } else { // DIRECT encoding // data stream could be empty stream or already reached end of stream before present stream. // This can happen if all values in stream are nulls or last row group values are all null. if (_dataStream.available() > 0) { if (_isFileCompressed) { index.getNext(); } ((StringDirectTreeReader) reader).getStream().seek(index); } if (_lengthStream.available() > 0) { if (_isFileCompressed) { index.getNext(); } ((StringDirectTreeReader) reader).getLengths().seek(index); } } } @Override public void nextVector( ColumnVector previousVector, boolean[] isNull, int batchSize) throws IOException { if (vectors == null) { super.nextVector(previousVector, isNull, batchSize); return; } vectors.get(vectorIndex++).shallowCopyTo(previousVector); if (vectorIndex == vectors.size()) { vectors = null; } } @Override public void setBuffers(EncodedColumnBatch<OrcBatchKey> batch, boolean sameStripe) throws IOException { assert vectors == null; // See the comment in TimestampStreamReader.setBuffers. 
ColumnStreamData[] streamsData = batch.getColumnData(columnId); if (_presentStream != null) { _presentStream.setBuffers(StreamUtils.createDiskRangeInfo(streamsData[OrcProto.Stream.Kind.PRESENT_VALUE])); } if (_dataStream != null) { _dataStream.setBuffers(StreamUtils.createDiskRangeInfo(streamsData[OrcProto.Stream.Kind.DATA_VALUE])); } if (!_isDictionaryEncoding) { if (_lengthStream != null) { _lengthStream.setBuffers(StreamUtils.createDiskRangeInfo(streamsData[OrcProto.Stream.Kind.LENGTH_VALUE])); } } // set these streams only if the stripe is different if (!sameStripe && _isDictionaryEncoding) { if (_lengthStream != null) { _lengthStream.setBuffers(StreamUtils.createDiskRangeInfo(streamsData[OrcProto.Stream.Kind.LENGTH_VALUE])); } if (_dictionaryStream != null) { _dictionaryStream.setBuffers(StreamUtils.createDiskRangeInfo(streamsData[OrcProto.Stream.Kind.DICTIONARY_DATA_VALUE])); } } } public static class StreamReaderBuilder { private int columnIndex; private int maxLength; private ColumnStreamData presentStream; private ColumnStreamData dataStream; private ColumnStreamData dictionaryStream; private ColumnStreamData lengthStream; private CompressionCodec compressionCodec; private OrcProto.ColumnEncoding columnEncoding; private List<ColumnVector> vectors; public StreamReaderBuilder setColumnIndex(int columnIndex) { this.columnIndex = columnIndex; return this; } public StreamReaderBuilder setMaxLength(int maxLength) { this.maxLength = maxLength; return this; } public StreamReaderBuilder setPresentStream(ColumnStreamData presentStream) { this.presentStream = presentStream; return this; } public StreamReaderBuilder setDataStream(ColumnStreamData dataStream) { this.dataStream = dataStream; return this; } public StreamReaderBuilder setLengthStream(ColumnStreamData lengthStream) { this.lengthStream = lengthStream; return this; } public StreamReaderBuilder setDictionaryStream(ColumnStreamData dictStream) { this.dictionaryStream = dictStream; return this; } public 
StreamReaderBuilder setCompressionCodec(CompressionCodec compressionCodec) { this.compressionCodec = compressionCodec; return this; } public StreamReaderBuilder setColumnEncoding(OrcProto.ColumnEncoding encoding) { this.columnEncoding = encoding; return this; } public CharStreamReader build() throws IOException { SettableUncompressedStream present = StreamUtils .createSettableUncompressedStream(OrcProto.Stream.Kind.PRESENT.name(), presentStream); SettableUncompressedStream data = StreamUtils .createSettableUncompressedStream(OrcProto.Stream.Kind.DATA.name(), dataStream); SettableUncompressedStream length = StreamUtils .createSettableUncompressedStream(OrcProto.Stream.Kind.LENGTH.name(), lengthStream); SettableUncompressedStream dictionary = StreamUtils.createSettableUncompressedStream( OrcProto.Stream.Kind.DICTIONARY_DATA.name(), dictionaryStream); boolean isFileCompressed = compressionCodec != null; return new CharStreamReader(columnIndex, maxLength, present, data, length, dictionary, isFileCompressed, columnEncoding, vectors); } public StreamReaderBuilder setVectors(List<ColumnVector> vectors) { this.vectors = vectors; return this; } } public static StreamReaderBuilder builder() { return new StreamReaderBuilder(); } } protected static class VarcharStreamReader extends VarcharTreeReader implements SettableTreeReader { private boolean _isFileCompressed; private boolean _isDictionaryEncoding; private SettableUncompressedStream _presentStream; private SettableUncompressedStream _dataStream; private SettableUncompressedStream _lengthStream; private SettableUncompressedStream _dictionaryStream; private List<ColumnVector> vectors; private int vectorIndex = 0; private VarcharStreamReader(int columnId, int maxLength, SettableUncompressedStream present, SettableUncompressedStream data, SettableUncompressedStream length, SettableUncompressedStream dictionary, boolean isFileCompressed, OrcProto.ColumnEncoding encoding, List<ColumnVector> vectors) throws IOException { 
super(columnId, maxLength, present, data, length, dictionary, encoding); this._isDictionaryEncoding = dictionary != null; this._isFileCompressed = isFileCompressed; this._presentStream = present; this._dataStream = data; this._lengthStream = length; this._dictionaryStream = dictionary; this.vectors = vectors; } @Override public void seek(PositionProvider[] index) throws IOException { // This string reader should simply redirect to its own seek (what other types already do). this.seek(index[columnId]); } @Override public void seek(PositionProvider index) throws IOException { if (vectors != null) return; if (present != null) { if (_isFileCompressed) { index.getNext(); } reader.getPresent().seek(index); } if (_isDictionaryEncoding) { // DICTIONARY encoding // data stream could be empty stream or already reached end of stream before present stream. // This can happen if all values in stream are nulls or last row group values are all null. if (_dataStream.available() > 0) { if (_isFileCompressed) { index.getNext(); } ((StringDictionaryTreeReader) reader).getReader().seek(index); } } else { // DIRECT encoding // data stream could be empty stream or already reached end of stream before present stream. // This can happen if all values in stream are nulls or last row group values are all null. 
if (_dataStream.available() > 0) { if (_isFileCompressed) { index.getNext(); } ((StringDirectTreeReader) reader).getStream().seek(index); } if (_lengthStream.available() > 0) { if (_isFileCompressed) { index.getNext(); } ((StringDirectTreeReader) reader).getLengths().seek(index); } } } @Override public void nextVector( ColumnVector previousVector, boolean[] isNull, int batchSize) throws IOException { if (vectors == null) { super.nextVector(previousVector, isNull, batchSize); return; } vectors.get(vectorIndex++).shallowCopyTo(previousVector); if (vectorIndex == vectors.size()) { vectors = null; } } @Override public void setBuffers(EncodedColumnBatch<OrcBatchKey> batch, boolean sameStripe) throws IOException { assert vectors == null; // See the comment in TimestampStreamReader.setBuffers. ColumnStreamData[] streamsData = batch.getColumnData(columnId); if (_presentStream != null) { _presentStream.setBuffers(StreamUtils.createDiskRangeInfo(streamsData[OrcProto.Stream.Kind.PRESENT_VALUE])); } if (_dataStream != null) { _dataStream.setBuffers(StreamUtils.createDiskRangeInfo(streamsData[OrcProto.Stream.Kind.DATA_VALUE])); } if (!_isDictionaryEncoding) { if (_lengthStream != null) { _lengthStream.setBuffers(StreamUtils.createDiskRangeInfo(streamsData[OrcProto.Stream.Kind.LENGTH_VALUE])); } } // set these streams only if the stripe is different if (!sameStripe && _isDictionaryEncoding) { if (_lengthStream != null) { _lengthStream.setBuffers(StreamUtils.createDiskRangeInfo(streamsData[OrcProto.Stream.Kind.LENGTH_VALUE])); } if (_dictionaryStream != null) { _dictionaryStream.setBuffers(StreamUtils.createDiskRangeInfo(streamsData[OrcProto.Stream.Kind.DICTIONARY_DATA_VALUE])); } } } public static class StreamReaderBuilder { private int columnIndex; private int maxLength; private ColumnStreamData presentStream; private ColumnStreamData dataStream; private ColumnStreamData dictionaryStream; private ColumnStreamData lengthStream; private CompressionCodec compressionCodec; private 
OrcProto.ColumnEncoding columnEncoding; private List<ColumnVector> vectors; public StreamReaderBuilder setColumnIndex(int columnIndex) { this.columnIndex = columnIndex; return this; } public StreamReaderBuilder setMaxLength(int maxLength) { this.maxLength = maxLength; return this; } public StreamReaderBuilder setPresentStream(ColumnStreamData presentStream) { this.presentStream = presentStream; return this; } public StreamReaderBuilder setDataStream(ColumnStreamData dataStream) { this.dataStream = dataStream; return this; } public StreamReaderBuilder setLengthStream(ColumnStreamData lengthStream) { this.lengthStream = lengthStream; return this; } public StreamReaderBuilder setDictionaryStream(ColumnStreamData dictStream) { this.dictionaryStream = dictStream; return this; } public StreamReaderBuilder setCompressionCodec(CompressionCodec compressionCodec) { this.compressionCodec = compressionCodec; return this; } public StreamReaderBuilder setColumnEncoding(OrcProto.ColumnEncoding encoding) { this.columnEncoding = encoding; return this; } public VarcharStreamReader build() throws IOException { SettableUncompressedStream present = StreamUtils .createSettableUncompressedStream(OrcProto.Stream.Kind.PRESENT.name(), presentStream); SettableUncompressedStream data = StreamUtils .createSettableUncompressedStream(OrcProto.Stream.Kind.DATA.name(), dataStream); SettableUncompressedStream length = StreamUtils .createSettableUncompressedStream(OrcProto.Stream.Kind.LENGTH.name(), lengthStream); SettableUncompressedStream dictionary = StreamUtils.createSettableUncompressedStream( OrcProto.Stream.Kind.DICTIONARY_DATA.name(), dictionaryStream); boolean isFileCompressed = compressionCodec != null; return new VarcharStreamReader(columnIndex, maxLength, present, data, length, dictionary, isFileCompressed, columnEncoding, vectors); } public StreamReaderBuilder setVectors(List<ColumnVector> vectors) { this.vectors = vectors; return this; } } public static StreamReaderBuilder builder() { 
return new StreamReaderBuilder(); } } protected static class ByteStreamReader extends ByteTreeReader implements SettableTreeReader { private boolean _isFileCompressed; private SettableUncompressedStream _presentStream; private SettableUncompressedStream _dataStream; private List<ColumnVector> vectors; private int vectorIndex = 0; private ByteStreamReader(int columnId, SettableUncompressedStream present, SettableUncompressedStream data, boolean isFileCompressed, List<ColumnVector> vectors) throws IOException { super(columnId, present, data); this._isFileCompressed = isFileCompressed; this._presentStream = present; this._dataStream = data; this.vectors = vectors; } @Override public void seek(PositionProvider index) throws IOException { if (vectors != null) return; if (present != null) { if (_isFileCompressed) { index.getNext(); } present.seek(index); } // data stream could be empty stream or already reached end of stream before present stream. // This can happen if all values in stream are nulls or last row group values are all null. if (_dataStream.available() > 0) { if (_isFileCompressed) { index.getNext(); } reader.seek(index); } } @Override public void nextVector( ColumnVector previousVector, boolean[] isNull, int batchSize) throws IOException { if (vectors == null) { super.nextVector(previousVector, isNull, batchSize); return; } vectors.get(vectorIndex++).shallowCopyTo(previousVector); if (vectorIndex == vectors.size()) { vectors = null; } } @Override public void setBuffers(EncodedColumnBatch<OrcBatchKey> batch, boolean sameStripe) throws IOException { assert vectors == null; // See the comment in TimestampStreamReader.setBuffers. 
ColumnStreamData[] streamsData = batch.getColumnData(columnId); if (_presentStream != null) { _presentStream.setBuffers(StreamUtils.createDiskRangeInfo(streamsData[OrcProto.Stream.Kind.PRESENT_VALUE])); } if (_dataStream != null) { _dataStream.setBuffers(StreamUtils.createDiskRangeInfo(streamsData[OrcProto.Stream.Kind.DATA_VALUE])); } } public static class StreamReaderBuilder { private int columnIndex; private ColumnStreamData presentStream; private ColumnStreamData dataStream; private CompressionCodec compressionCodec; private List<ColumnVector> vectors; public StreamReaderBuilder setColumnIndex(int columnIndex) { this.columnIndex = columnIndex; return this; } public StreamReaderBuilder setPresentStream(ColumnStreamData presentStream) { this.presentStream = presentStream; return this; } public StreamReaderBuilder setDataStream(ColumnStreamData dataStream) { this.dataStream = dataStream; return this; } public StreamReaderBuilder setCompressionCodec(CompressionCodec compressionCodec) { this.compressionCodec = compressionCodec; return this; } public ByteStreamReader build() throws IOException { SettableUncompressedStream present = StreamUtils .createSettableUncompressedStream(OrcProto.Stream.Kind.PRESENT.name(), presentStream); SettableUncompressedStream data = StreamUtils .createSettableUncompressedStream(OrcProto.Stream.Kind.DATA.name(), dataStream); boolean isFileCompressed = compressionCodec != null; return new ByteStreamReader(columnIndex, present, data, isFileCompressed, vectors); } public StreamReaderBuilder setVectors(List<ColumnVector> vectors) { this.vectors = vectors; return this; } } public static StreamReaderBuilder builder() { return new StreamReaderBuilder(); } } protected static class BinaryStreamReader extends BinaryTreeReader implements SettableTreeReader { private boolean _isFileCompressed; private SettableUncompressedStream _presentStream; private SettableUncompressedStream _dataStream; private SettableUncompressedStream _lengthsStream; private 
List<ColumnVector> vectors; private int vectorIndex = 0; private BinaryStreamReader(int columnId, SettableUncompressedStream present, SettableUncompressedStream data, SettableUncompressedStream length, boolean isFileCompressed, OrcProto.ColumnEncoding encoding, TreeReaderFactory.Context context, List<ColumnVector> vectors) throws IOException { super(columnId, present, data, length, encoding, context); this._isFileCompressed = isFileCompressed; this._presentStream = present; this._dataStream = data; this._lengthsStream = length; this.vectors = vectors; } @Override public void nextVector( ColumnVector previousVector, boolean[] isNull, int batchSize) throws IOException { if (vectors == null) { super.nextVector(previousVector, isNull, batchSize); return; } vectors.get(vectorIndex++).shallowCopyTo(previousVector); if (vectorIndex == vectors.size()) { vectors = null; } } @Override public void seek(PositionProvider index) throws IOException { if (vectors != null) return; if (present != null) { if (_isFileCompressed) { index.getNext(); } present.seek(index); } // data stream could be empty stream or already reached end of stream before present stream. // This can happen if all values in stream are nulls or last row group values are all null. if (_dataStream.available() > 0) { if (_isFileCompressed) { index.getNext(); } stream.seek(index); } if (lengths != null && _lengthsStream.available() > 0) { if (_isFileCompressed) { index.getNext(); } lengths.seek(index); } } @Override public void setBuffers(EncodedColumnBatch<OrcBatchKey> batch, boolean sameStripe) throws IOException { assert vectors == null; // See the comment in TimestampStreamReader.setBuffers. 
ColumnStreamData[] streamsData = batch.getColumnData(columnId); if (_presentStream != null) { _presentStream.setBuffers(StreamUtils.createDiskRangeInfo(streamsData[OrcProto.Stream.Kind.PRESENT_VALUE])); } if (_dataStream != null) { _dataStream.setBuffers(StreamUtils.createDiskRangeInfo(streamsData[OrcProto.Stream.Kind.DATA_VALUE])); } if (_lengthsStream != null) { _lengthsStream.setBuffers(StreamUtils.createDiskRangeInfo(streamsData[OrcProto.Stream.Kind.LENGTH_VALUE])); } } public static class StreamReaderBuilder { private int columnIndex; private ColumnStreamData presentStream; private ColumnStreamData dataStream; private ColumnStreamData lengthStream; private CompressionCodec compressionCodec; private OrcProto.ColumnEncoding columnEncoding; private List<ColumnVector> vectors; private TreeReaderFactory.Context context; public StreamReaderBuilder setColumnIndex(int columnIndex) { this.columnIndex = columnIndex; return this; } public StreamReaderBuilder setPresentStream(ColumnStreamData presentStream) { this.presentStream = presentStream; return this; } public StreamReaderBuilder setDataStream(ColumnStreamData dataStream) { this.dataStream = dataStream; return this; } public StreamReaderBuilder setLengthStream(ColumnStreamData secondaryStream) { this.lengthStream = secondaryStream; return this; } public StreamReaderBuilder setCompressionCodec(CompressionCodec compressionCodec) { this.compressionCodec = compressionCodec; return this; } public StreamReaderBuilder setColumnEncoding(OrcProto.ColumnEncoding encoding) { this.columnEncoding = encoding; return this; } public StreamReaderBuilder setContext(TreeReaderFactory.Context context) { this.context = context; return this; } public BinaryStreamReader build() throws IOException { SettableUncompressedStream present = StreamUtils.createSettableUncompressedStream( OrcProto.Stream.Kind.PRESENT.name(), presentStream); SettableUncompressedStream data = StreamUtils.createSettableUncompressedStream( 
OrcProto.Stream.Kind.DATA.name(), dataStream); SettableUncompressedStream length = StreamUtils.createSettableUncompressedStream( OrcProto.Stream.Kind.LENGTH.name(), lengthStream); boolean isFileCompressed = compressionCodec != null; return new BinaryStreamReader(columnIndex, present, data, length, isFileCompressed, columnEncoding, context, vectors); } public StreamReaderBuilder setVectors(List<ColumnVector> vectors) { this.vectors = vectors; return this; } } public static StreamReaderBuilder builder() { return new StreamReaderBuilder(); } } protected static class BooleanStreamReader extends BooleanTreeReader implements SettableTreeReader { private boolean _isFileCompressed; private SettableUncompressedStream _presentStream; private SettableUncompressedStream _dataStream; private List<ColumnVector> vectors; private int vectorIndex = 0; private BooleanStreamReader(int columnId, SettableUncompressedStream present, SettableUncompressedStream data, boolean isFileCompressed, List<ColumnVector> vectors) throws IOException { super(columnId, present, data); this._isFileCompressed = isFileCompressed; this._presentStream = present; this._dataStream = data; this.vectors = vectors; } @Override public void seek(PositionProvider index) throws IOException { if (vectors != null) return; if (present != null) { if (_isFileCompressed) { index.getNext(); } present.seek(index); } // data stream could be empty stream or already reached end of stream before present stream. // This can happen if all values in stream are nulls or last row group values are all null. 
if (_dataStream.available() > 0) { if (_isFileCompressed) { index.getNext(); } reader.seek(index); } } @Override public void nextVector( ColumnVector previousVector, boolean[] isNull, int batchSize) throws IOException { if (vectors == null) { super.nextVector(previousVector, isNull, batchSize); return; } vectors.get(vectorIndex++).shallowCopyTo(previousVector); if (vectorIndex == vectors.size()) { vectors = null; } } @Override public void setBuffers(EncodedColumnBatch<OrcBatchKey> batch, boolean sameStripe) throws IOException { assert vectors == null; // See the comment in TimestampStreamReader.setBuffers. ColumnStreamData[] streamsData = batch.getColumnData(columnId); if (_presentStream != null) { _presentStream.setBuffers(StreamUtils.createDiskRangeInfo(streamsData[OrcProto.Stream.Kind.PRESENT_VALUE])); } if (_dataStream != null) { _dataStream.setBuffers(StreamUtils.createDiskRangeInfo(streamsData[OrcProto.Stream.Kind.DATA_VALUE])); } } public static class StreamReaderBuilder { private int columnIndex; private ColumnStreamData presentStream; private ColumnStreamData dataStream; private CompressionCodec compressionCodec; private List<ColumnVector> vectors; public StreamReaderBuilder setColumnIndex(int columnIndex) { this.columnIndex = columnIndex; return this; } public StreamReaderBuilder setPresentStream(ColumnStreamData presentStream) { this.presentStream = presentStream; return this; } public StreamReaderBuilder setDataStream(ColumnStreamData dataStream) { this.dataStream = dataStream; return this; } public StreamReaderBuilder setCompressionCodec(CompressionCodec compressionCodec) { this.compressionCodec = compressionCodec; return this; } public BooleanStreamReader build() throws IOException { SettableUncompressedStream present = StreamUtils .createSettableUncompressedStream(OrcProto.Stream.Kind.PRESENT.name(), presentStream); SettableUncompressedStream data = StreamUtils .createSettableUncompressedStream(OrcProto.Stream.Kind.DATA.name(), dataStream); boolean 
isFileCompressed = compressionCodec != null;
    return new BooleanStreamReader(columnIndex, present, data, isFileCompressed, vectors);
  }

  public StreamReaderBuilder setVectors(List<ColumnVector> vectors) {
    this.vectors = vectors;
    return this;
  }
}

public static StreamReaderBuilder builder() {
  return new StreamReaderBuilder();
}
}

/**
 * Builds the root STRUCT reader for an encoded batch. Children with neither stream data
 * nor cached vectors (i.e. excluded columns; only possible at root level) are skipped;
 * columnMapping[i] receives the schema child index backing the i-th included reader.
 */
public static StructTreeReader createRootTreeReader(TypeDescription schema,
    List<OrcProto.ColumnEncoding> encodings, OrcEncodedColumnBatch batch,
    CompressionCodec codec, TreeReaderFactory.Context context, int[] columnMapping)
        throws IOException {
  if (schema.getCategory() != Category.STRUCT) {
    throw new AssertionError("Schema is not a struct: " + schema);
  }
  // Some child types may be excluded. Note that this can only happen at root level.
  List<TypeDescription> children = schema.getChildren();
  // First pass: count included children so the reader array can be sized exactly.
  int childCount = children.size(), includedCount = 0;
  for (int childIx = 0; childIx < childCount; ++childIx) {
    int batchColIx = children.get(childIx).getId();
    if (!batch.hasData(batchColIx) && !batch.hasVectors(batchColIx)) {
      if (LOG.isDebugEnabled()) {
        // NOTE(review): plain string concatenation; the rest of the file uses
        // SLF4J parameterized logging.
        LOG.debug("Column at " + childIx + " " + children.get(childIx).getId() + ":"
            + children.get(childIx).toString() + " has no data");
      }
      continue;
    }
    ++includedCount;
  }
  // Second pass: build a reader per included child and record the schema mapping.
  TreeReader[] childReaders = new TreeReader[includedCount];
  for (int schemaChildIx = 0, inclChildIx = -1; schemaChildIx < childCount; ++schemaChildIx) {
    int batchColIx = children.get(schemaChildIx).getId();
    if (!batch.hasData(batchColIx) && !batch.hasVectors(batchColIx)) continue;
    childReaders[++inclChildIx] = createEncodedTreeReader(
        schema.getChildren().get(schemaChildIx), encodings, batch, codec, context);
    columnMapping[inclChildIx] = schemaChildIx;
  }
  return StructStreamReader.builder()
      .setColumnIndex(0)
      .setCompressionCodec(codec)
      .setColumnEncoding(encodings.get(0))
      .setChildReaders(childReaders)
      .setContext(context)
      .build();
}

/**
 * Recursively builds a tree reader for one column of the encoded batch, dispatching on
 * the column's type category.
 */
private static TreeReader createEncodedTreeReader(TypeDescription schema,
    List<OrcProto.ColumnEncoding> encodings,
    OrcEncodedColumnBatch batch, CompressionCodec codec,
    TreeReaderFactory.Context context) throws IOException {
  int columnIndex = schema.getId();
  // A column arrives either as raw stream buffers or as already-decoded cached vectors.
  ColumnStreamData[] streamBuffers = null;
  List<ColumnVector> vectors = null;
  if (batch.hasData(columnIndex)) {
    streamBuffers = batch.getColumnData(columnIndex);
  } else if (batch.hasVectors(columnIndex)) {
    vectors = batch.getColumnVectors(columnIndex);
  } else {
    throw new AssertionError("Batch has no data for " + columnIndex + ": " + batch);
  }
  // EncodedColumnBatch is already decompressed, we don't really need to pass codec.
  // But we need to know if the original data is compressed or not. This is used to skip
  // positions in row index properly. If the file is originally compressed,
  // then 1st position (compressed offset) in row index should be skipped to get
  // uncompressed offset, else 1st position should not be skipped.
  // TODO: there should be a better way to do this, code just needs to be modified
  OrcProto.ColumnEncoding columnEncoding = encodings.get(columnIndex);
  // stream buffers are arranged in enum order of stream kind
  ColumnStreamData present = null, data = null, dictionary = null, lengths = null,
      secondary = null;
  if (streamBuffers != null) {
    present = streamBuffers[OrcProto.Stream.Kind.PRESENT_VALUE];
    data = streamBuffers[OrcProto.Stream.Kind.DATA_VALUE];
    dictionary = streamBuffers[OrcProto.Stream.Kind.DICTIONARY_DATA_VALUE];
    lengths = streamBuffers[OrcProto.Stream.Kind.LENGTH_VALUE];
    secondary = streamBuffers[OrcProto.Stream.Kind.SECONDARY_VALUE];
  }
  if (LOG.isDebugEnabled()) {
    // NOTE(review): "data" is logged as the raw object while its siblings log
    // null-ness booleans - probably meant "data != null"; confirm before changing.
    LOG.debug("columnIndex: {} columnType: {} streamBuffers.length: {} vectors: {} columnEncoding: {}"
        + " present: {} data: {} dictionary: {} lengths: {} secondary: {} tz: {}",
        columnIndex, schema, streamBuffers == null ? 0 : streamBuffers.length,
        vectors == null ? 0 : vectors.size(), columnEncoding, present != null, data,
        dictionary != null, lengths != null, secondary != null, context.getWriterTimezone());
  }
  // TODO: get rid of the builders - they serve no purpose... just call ctors directly.
  switch (schema.getCategory()) {
  // All primitive categories fall through to the shared primitive factory.
  case BINARY:
  case BOOLEAN:
  case BYTE:
  case SHORT:
  case INT:
  case LONG:
  case FLOAT:
  case DOUBLE:
  case CHAR:
  case VARCHAR:
  case STRING:
  case DECIMAL:
  case TIMESTAMP:
  case DATE:
    return getPrimitiveTreeReader(columnIndex, schema, codec, columnEncoding, present,
        data, dictionary, lengths, secondary, context, vectors);
  case LIST:
    assert vectors == null; // Not currently supported.
    TypeDescription elementType = schema.getChildren().get(0);
    TreeReader elementReader = createEncodedTreeReader(
        elementType, encodings, batch, codec, context);
    return ListStreamReader.builder()
        .setColumnIndex(columnIndex)
        .setColumnEncoding(columnEncoding)
        .setCompressionCodec(codec)
        .setPresentStream(present)
        .setLengthStream(lengths)
        .setElementReader(elementReader)
        .setContext(context)
        .build();
  case MAP:
    assert vectors == null; // Not currently supported.
    TypeDescription keyType = schema.getChildren().get(0);
    TypeDescription valueType = schema.getChildren().get(1);
    TreeReader keyReader = createEncodedTreeReader(
        keyType, encodings, batch, codec, context);
    TreeReader valueReader = createEncodedTreeReader(
        valueType, encodings, batch, codec, context);
    return MapStreamReader.builder()
        .setColumnIndex(columnIndex)
        .setColumnEncoding(columnEncoding)
        .setCompressionCodec(codec)
        .setPresentStream(present)
        .setLengthStream(lengths)
        .setKeyReader(keyReader)
        .setValueReader(valueReader)
        .setContext(context)
        .build();
  case STRUCT: {
    assert vectors == null; // Not currently supported.
int childCount = schema.getChildren().size(); TreeReader[] childReaders = new TreeReader[childCount]; for (int i = 0; i < childCount; i++) { TypeDescription childType = schema.getChildren().get(i); childReaders[i] = createEncodedTreeReader( childType, encodings, batch, codec, context); } return StructStreamReader.builder() .setColumnIndex(columnIndex) .setCompressionCodec(codec) .setColumnEncoding(columnEncoding) .setPresentStream(present) .setChildReaders(childReaders) .setContext(context) .build(); } case UNION: { assert vectors == null; // Not currently supported. int childCount = schema.getChildren().size(); TreeReader[] childReaders = new TreeReader[childCount]; for (int i = 0; i < childCount; i++) { TypeDescription childType = schema.getChildren().get(i); childReaders[i] = createEncodedTreeReader( childType, encodings, batch, codec, context); } return UnionStreamReader.builder() .setColumnIndex(columnIndex) .setCompressionCodec(codec) .setColumnEncoding(columnEncoding) .setPresentStream(present) .setDataStream(data) .setChildReaders(childReaders) .setContext(context) .build(); } default: throw new UnsupportedOperationException("Data type not supported: " + schema); } } private static TreeReader getPrimitiveTreeReader(final int columnIndex, TypeDescription columnType, CompressionCodec codec, OrcProto.ColumnEncoding columnEncoding, ColumnStreamData present, ColumnStreamData data, ColumnStreamData dictionary, ColumnStreamData lengths, ColumnStreamData secondary, TreeReaderFactory.Context context, List<ColumnVector> vectors) throws IOException { switch (columnType.getCategory()) { case BINARY: return BinaryStreamReader.builder() .setColumnIndex(columnIndex) .setPresentStream(present) .setDataStream(data) .setLengthStream(lengths) .setCompressionCodec(codec) .setColumnEncoding(columnEncoding) .setVectors(vectors) .setContext(context) .build(); case BOOLEAN: return BooleanStreamReader.builder() .setColumnIndex(columnIndex) .setPresentStream(present) 
.setDataStream(data) .setCompressionCodec(codec) .setVectors(vectors) .build(); case BYTE: return ByteStreamReader.builder() .setColumnIndex(columnIndex) .setPresentStream(present) .setDataStream(data) .setCompressionCodec(codec) .setVectors(vectors) .build(); case SHORT: return ShortStreamReader.builder() .setColumnIndex(columnIndex) .setPresentStream(present) .setDataStream(data) .setCompressionCodec(codec) .setColumnEncoding(columnEncoding) .setVectors(vectors) .setContext(context) .build(); case INT: return IntStreamReader.builder() .setColumnIndex(columnIndex) .setPresentStream(present) .setDataStream(data) .setCompressionCodec(codec) .setColumnEncoding(columnEncoding) .setVectors(vectors) .setContext(context) .build(); case LONG: return LongStreamReader.builder() .setColumnIndex(columnIndex) .setPresentStream(present) .setDataStream(data) .setCompressionCodec(codec) .setColumnEncoding(columnEncoding) .setVectors(vectors) .setContext(context) .build(); case FLOAT: return FloatStreamReader.builder() .setColumnIndex(columnIndex) .setPresentStream(present) .setDataStream(data) .setCompressionCodec(codec) .setVectors(vectors) .build(); case DOUBLE: return DoubleStreamReader.builder() .setColumnIndex(columnIndex) .setPresentStream(present) .setDataStream(data) .setCompressionCodec(codec) .setVectors(vectors) .build(); case CHAR: return CharStreamReader.builder() .setColumnIndex(columnIndex) .setMaxLength(columnType.getMaxLength()) .setPresentStream(present) .setDataStream(data) .setLengthStream(lengths) .setDictionaryStream(dictionary) .setCompressionCodec(codec) .setColumnEncoding(columnEncoding) .setVectors(vectors) .build(); case VARCHAR: return VarcharStreamReader.builder() .setColumnIndex(columnIndex) .setMaxLength(columnType.getMaxLength()) .setPresentStream(present) .setDataStream(data) .setLengthStream(lengths) .setDictionaryStream(dictionary) .setCompressionCodec(codec) .setColumnEncoding(columnEncoding) .setVectors(vectors) .build(); case STRING: return 
StringStreamReader.builder() .setColumnIndex(columnIndex) .setPresentStream(present) .setDataStream(data) .setLengthStream(lengths) .setDictionaryStream(dictionary) .setCompressionCodec(codec) .setColumnEncoding(columnEncoding) .setVectors(vectors) .build(); case DECIMAL: return DecimalStreamReader.builder() .setColumnIndex(columnIndex) .setPrecision(columnType.getPrecision()) .setScale(columnType.getScale()) .setPresentStream(present) .setValueStream(data) .setScaleStream(secondary) .setCompressionCodec(codec) .setColumnEncoding(columnEncoding) .setVectors(vectors) .setContext(context) .build(); case TIMESTAMP: return TimestampStreamReader.builder() .setColumnIndex(columnIndex) .setPresentStream(present) .setSecondsStream(data) .setNanosStream(secondary) .setCompressionCodec(codec) .setColumnEncoding(columnEncoding) .setVectors(vectors) .setContext(context) .build(); case DATE: return DateStreamReader.builder() .setColumnIndex(columnIndex) .setPresentStream(present) .setDataStream(data) .setCompressionCodec(codec) .setColumnEncoding(columnEncoding) .setVectors(vectors) .setContext(context) .build(); default: throw new AssertionError("Not a primitive category: " + columnType.getCategory()); } } protected static class ListStreamReader extends ListTreeReader implements SettableTreeReader { private boolean _isFileCompressed; private SettableUncompressedStream _presentStream; private SettableUncompressedStream _lengthStream; public ListStreamReader(final int columnIndex, final SettableUncompressedStream present, final SettableUncompressedStream lengthStream, final OrcProto.ColumnEncoding columnEncoding, final boolean isFileCompressed, final TreeReader elementReader, TreeReaderFactory.Context context) throws IOException { super(columnIndex, present, context, lengthStream, columnEncoding, elementReader); this._isFileCompressed = isFileCompressed; this._presentStream = present; this._lengthStream = lengthStream; } @Override public void seek(PositionProvider[] index) 
throws IOException { PositionProvider ownIndex = index[columnId]; if (present != null) { if (_isFileCompressed) { ownIndex.getNext(); } present.seek(ownIndex); } // lengths stream could be empty stream or already reached end of stream before present stream. // This can happen if all values in stream are nulls or last row group values are all null. if (_lengthStream.available() > 0) { if (_isFileCompressed) { ownIndex.getNext(); } lengths.seek(ownIndex); elementReader.seek(index); } } @Override public void seek(PositionProvider index) throws IOException { // Only our parent class can call this. throw new IOException("Should never be called"); } @Override public void setBuffers(EncodedColumnBatch<OrcBatchKey> batch, boolean sameStripe) throws IOException { ColumnStreamData[] streamsData = batch.getColumnData(columnId); if (_presentStream != null) { _presentStream.setBuffers( StreamUtils.createDiskRangeInfo(streamsData[OrcProto.Stream.Kind.PRESENT_VALUE])); } if (_lengthStream != null) { _lengthStream.setBuffers( StreamUtils.createDiskRangeInfo(streamsData[OrcProto.Stream.Kind.LENGTH_VALUE])); } if (elementReader != null) { ((SettableTreeReader) elementReader).setBuffers(batch, sameStripe); } } public static class StreamReaderBuilder { private int columnIndex; private ColumnStreamData presentStream; private ColumnStreamData lengthStream; private CompressionCodec compressionCodec; private OrcProto.ColumnEncoding columnEncoding; private TreeReader elementReader; private TreeReaderFactory.Context context; public ListStreamReader.StreamReaderBuilder setColumnIndex(int columnIndex) { this.columnIndex = columnIndex; return this; } public ListStreamReader.StreamReaderBuilder setLengthStream(ColumnStreamData lengthStream) { this.lengthStream = lengthStream; return this; } public ListStreamReader.StreamReaderBuilder setPresentStream(ColumnStreamData presentStream) { this.presentStream = presentStream; return this; } public ListStreamReader.StreamReaderBuilder 
setColumnEncoding(OrcProto.ColumnEncoding encoding) { this.columnEncoding = encoding; return this; } public ListStreamReader.StreamReaderBuilder setCompressionCodec(CompressionCodec compressionCodec) { this.compressionCodec = compressionCodec; return this; } public ListStreamReader.StreamReaderBuilder setElementReader(TreeReader elementReader) { this.elementReader = elementReader; return this; } public ListStreamReader.StreamReaderBuilder setContext(TreeReaderFactory.Context context) { this.context = context; return this; } public ListStreamReader build() throws IOException { SettableUncompressedStream present = StreamUtils .createSettableUncompressedStream(OrcProto.Stream.Kind.PRESENT.name(), presentStream); SettableUncompressedStream length = StreamUtils .createSettableUncompressedStream(OrcProto.Stream.Kind.LENGTH.name(), lengthStream); boolean isFileCompressed = compressionCodec != null; return new ListStreamReader(columnIndex, present, length, columnEncoding, isFileCompressed, elementReader, context); } } public static ListStreamReader.StreamReaderBuilder builder() { return new ListStreamReader.StreamReaderBuilder(); } } protected static class MapStreamReader extends MapTreeReader implements SettableTreeReader{ private boolean _isFileCompressed; private SettableUncompressedStream _presentStream; private SettableUncompressedStream _lengthStream; public MapStreamReader(final int columnIndex, final SettableUncompressedStream present, final SettableUncompressedStream lengthStream, final OrcProto.ColumnEncoding columnEncoding, final boolean isFileCompressed, final TreeReader keyReader, final TreeReader valueReader, TreeReaderFactory.Context context) throws IOException { super(columnIndex, present, context, lengthStream, columnEncoding, keyReader, valueReader); this._isFileCompressed = isFileCompressed; this._presentStream = present; this._lengthStream = lengthStream; } @Override public void seek(PositionProvider[] index) throws IOException { // We are not calling 
super.seek since we handle the present stream differently. PositionProvider ownIndex = index[columnId]; if (present != null) { if (_isFileCompressed) { ownIndex.getNext(); } present.seek(ownIndex); } // lengths stream could be empty stream or already reached end of stream before present stream. // This can happen if all values in stream are nulls or last row group values are all null. if (_lengthStream.available() > 0) { if (_isFileCompressed) { ownIndex.getNext(); } lengths.seek(ownIndex); keyReader.seek(index); valueReader.seek(index); } } @Override public void seek(PositionProvider index) throws IOException { // Only our parent class can call this. throw new IOException("Should never be called"); } @Override public void setBuffers(EncodedColumnBatch<OrcBatchKey> batch, boolean sameStripe) throws IOException { ColumnStreamData[] streamsData = batch.getColumnData(columnId); if (_presentStream != null) { _presentStream.setBuffers( StreamUtils.createDiskRangeInfo(streamsData[OrcProto.Stream.Kind.PRESENT_VALUE])); } if (_lengthStream != null) { _lengthStream.setBuffers( StreamUtils.createDiskRangeInfo(streamsData[OrcProto.Stream.Kind.LENGTH_VALUE])); } if (keyReader != null) { ((SettableTreeReader) keyReader).setBuffers(batch, sameStripe); } if (valueReader != null) { ((SettableTreeReader) valueReader).setBuffers(batch, sameStripe); } } public static class StreamReaderBuilder { private int columnIndex; private ColumnStreamData presentStream; private ColumnStreamData lengthStream; private CompressionCodec compressionCodec; private OrcProto.ColumnEncoding columnEncoding; private TreeReader keyReader; private TreeReader valueReader; private TreeReaderFactory.Context context; public MapStreamReader.StreamReaderBuilder setColumnIndex(int columnIndex) { this.columnIndex = columnIndex; return this; } public MapStreamReader.StreamReaderBuilder setLengthStream(ColumnStreamData lengthStream) { this.lengthStream = lengthStream; return this; } public 
MapStreamReader.StreamReaderBuilder setPresentStream(ColumnStreamData presentStream) { this.presentStream = presentStream; return this; } public MapStreamReader.StreamReaderBuilder setColumnEncoding(OrcProto.ColumnEncoding encoding) { this.columnEncoding = encoding; return this; } public MapStreamReader.StreamReaderBuilder setCompressionCodec(CompressionCodec compressionCodec) { this.compressionCodec = compressionCodec; return this; } public MapStreamReader.StreamReaderBuilder setKeyReader(TreeReader keyReader) { this.keyReader = keyReader; return this; } public MapStreamReader.StreamReaderBuilder setValueReader(TreeReader valueReader) { this.valueReader = valueReader; return this; } public MapStreamReader.StreamReaderBuilder setContext(TreeReaderFactory.Context context) { this.context = context; return this; } public MapStreamReader build() throws IOException { SettableUncompressedStream present = StreamUtils .createSettableUncompressedStream(OrcProto.Stream.Kind.PRESENT.name(), presentStream); SettableUncompressedStream length = StreamUtils .createSettableUncompressedStream(OrcProto.Stream.Kind.LENGTH.name(), lengthStream); boolean isFileCompressed = compressionCodec != null; return new MapStreamReader(columnIndex, present, length, columnEncoding, isFileCompressed, keyReader, valueReader, context); } } public static MapStreamReader.StreamReaderBuilder builder() { return new MapStreamReader.StreamReaderBuilder(); } } protected static class StructStreamReader extends StructTreeReader implements SettableTreeReader { private boolean _isFileCompressed; private SettableUncompressedStream _presentStream; public StructStreamReader(final int columnIndex, final SettableUncompressedStream present, final OrcProto.ColumnEncoding columnEncoding, final boolean isFileCompressed, final TreeReader[] childReaders, TreeReaderFactory.Context context) throws IOException { super(columnIndex, present, context, columnEncoding, childReaders); this._isFileCompressed = isFileCompressed; 
this._presentStream = present; } @Override public void seek(PositionProvider[] index) throws IOException { PositionProvider ownIndex = index[columnId]; if (present != null) { if (_isFileCompressed) { ownIndex.getNext(); } present.seek(ownIndex); } if (fields != null) { for (TreeReader child : fields) { child.seek(index); } } } @Override public void seek(PositionProvider index) throws IOException { // Only our parent class can call this. throw new IOException("Should never be called"); } @Override public void setBuffers(EncodedColumnBatch<OrcBatchKey> batch, boolean sameStripe) throws IOException { ColumnStreamData[] streamsData = batch.getColumnData(columnId); if (_presentStream != null) { _presentStream.setBuffers( StreamUtils.createDiskRangeInfo(streamsData[OrcProto.Stream.Kind.PRESENT_VALUE])); } if (fields != null) { for (TreeReader child : fields) { ((SettableTreeReader) child).setBuffers(batch, sameStripe); } } } public static class StreamReaderBuilder { private int columnIndex; private ColumnStreamData presentStream; private CompressionCodec compressionCodec; private OrcProto.ColumnEncoding columnEncoding; private TreeReader[] childReaders; private TreeReaderFactory.Context context; public StructStreamReader.StreamReaderBuilder setColumnIndex(int columnIndex) { this.columnIndex = columnIndex; return this; } public StructStreamReader.StreamReaderBuilder setPresentStream(ColumnStreamData presentStream) { this.presentStream = presentStream; return this; } public StructStreamReader.StreamReaderBuilder setColumnEncoding(OrcProto.ColumnEncoding encoding) { this.columnEncoding = encoding; return this; } public StructStreamReader.StreamReaderBuilder setCompressionCodec(CompressionCodec compressionCodec) { this.compressionCodec = compressionCodec; return this; } public StructStreamReader.StreamReaderBuilder setChildReaders(TreeReader[] childReaders) { this.childReaders = childReaders; return this; } public StructStreamReader.StreamReaderBuilder 
setContext(TreeReaderFactory.Context context) { this.context = context; return this; } public StructStreamReader build() throws IOException { SettableUncompressedStream present = StreamUtils .createSettableUncompressedStream(OrcProto.Stream.Kind.PRESENT.name(), presentStream); boolean isFileCompressed = compressionCodec != null; return new StructStreamReader(columnIndex, present, columnEncoding, isFileCompressed, childReaders, context); } } public static StructStreamReader.StreamReaderBuilder builder() { return new StructStreamReader.StreamReaderBuilder(); } } protected static class UnionStreamReader extends UnionTreeReader implements SettableTreeReader { private boolean _isFileCompressed; private SettableUncompressedStream _presentStream; private SettableUncompressedStream _dataStream; public UnionStreamReader(final int columnIndex, final SettableUncompressedStream present, final SettableUncompressedStream dataStream, final OrcProto.ColumnEncoding columnEncoding, final boolean isFileCompressed, final TreeReader[] childReaders, TreeReaderFactory.Context context) throws IOException { super(columnIndex, present, context, columnEncoding, childReaders); this._isFileCompressed = isFileCompressed; this._presentStream = present; this._dataStream = dataStream; // Note: other parent readers init everything in ctor, but union does it in startStripe. this.tags = new RunLengthByteReader(dataStream); } @Override public void seek(PositionProvider[] index) throws IOException { PositionProvider ownIndex = index[columnId]; if (present != null) { if (_isFileCompressed) { ownIndex.getNext(); } present.seek(ownIndex); } // lengths stream could be empty stream or already reached end of stream before present stream. // This can happen if all values in stream are nulls or last row group values are all null. 
if (_dataStream.available() > 0) { if (_isFileCompressed) { ownIndex.getNext(); } tags.seek(ownIndex); if (fields != null) { for (TreeReader child : fields) { child.seek(index); } } } } @Override public void seek(PositionProvider index) throws IOException { // Only our parent class can call this. throw new IOException("Should never be called"); } @Override public void setBuffers(EncodedColumnBatch<OrcBatchKey> batch, boolean sameStripe) throws IOException { ColumnStreamData[] streamsData = batch.getColumnData(columnId); if (_presentStream != null) { _presentStream.setBuffers( StreamUtils.createDiskRangeInfo(streamsData[OrcProto.Stream.Kind.PRESENT_VALUE])); } if (_dataStream != null) { _dataStream.setBuffers( StreamUtils.createDiskRangeInfo(streamsData[OrcProto.Stream.Kind.DATA_VALUE])); } if (fields != null) { for (TreeReader child : fields) { ((SettableTreeReader) child).setBuffers(batch, sameStripe); } } } public static class StreamReaderBuilder { private int columnIndex; private ColumnStreamData presentStream; private ColumnStreamData dataStream; private CompressionCodec compressionCodec; private OrcProto.ColumnEncoding columnEncoding; private TreeReader[] childReaders; private TreeReaderFactory.Context context; public UnionStreamReader.StreamReaderBuilder setColumnIndex(int columnIndex) { this.columnIndex = columnIndex; return this; } public UnionStreamReader.StreamReaderBuilder setDataStream(ColumnStreamData dataStream) { this.dataStream = dataStream; return this; } public UnionStreamReader.StreamReaderBuilder setPresentStream(ColumnStreamData presentStream) { this.presentStream = presentStream; return this; } public UnionStreamReader.StreamReaderBuilder setColumnEncoding(OrcProto.ColumnEncoding encoding) { this.columnEncoding = encoding; return this; } public UnionStreamReader.StreamReaderBuilder setCompressionCodec(CompressionCodec compressionCodec) { this.compressionCodec = compressionCodec; return this; } public UnionStreamReader.StreamReaderBuilder 
setChildReaders(TreeReader[] childReaders) { this.childReaders = childReaders; return this; } public UnionStreamReader.StreamReaderBuilder setContext(TreeReaderFactory.Context context) { this.context = context; return this; } public UnionStreamReader build() throws IOException { SettableUncompressedStream present = StreamUtils.createSettableUncompressedStream( OrcProto.Stream.Kind.PRESENT.name(), presentStream); SettableUncompressedStream data = StreamUtils.createSettableUncompressedStream( OrcProto.Stream.Kind.DATA.name(), dataStream); boolean isFileCompressed = compressionCodec != null; return new UnionStreamReader(columnIndex, present, data, columnEncoding, isFileCompressed, childReaders, context); } } public static UnionStreamReader.StreamReaderBuilder builder() { return new UnionStreamReader.StreamReaderBuilder(); } } }