/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.io.orc.encoded;

import java.io.IOException;
import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.hive.common.Pool;
import org.apache.hadoop.hive.common.Pool.PoolObjectHelper;
import org.apache.hadoop.hive.common.io.DataCache;
import org.apache.hadoop.hive.common.io.encoded.EncodedColumnBatch;
import org.apache.hadoop.hive.common.io.encoded.EncodedColumnBatch.ColumnStreamData;
import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
import org.apache.orc.DataReader;
import org.apache.orc.OrcProto;

/**
 * The interface for reading encoded data from ORC files.
 */
public interface Reader extends org.apache.hadoop.hive.ql.io.orc.Reader {

  /**
   * Creates the encoded reader.
   * @param fileKey File ID to read, to use for cache lookups and such.
   * @param dataCache Data cache to use for cache lookups.
   * @param dataReader Data reader to read data not found in cache (from disk, HDFS, and such).
   * @param pf Pool factory to create object pools.
   * @return The reader.
   */
  EncodedReader encodedReader(Object fileKey, DataCache dataCache, DataReader dataReader,
      PoolFactory pf) throws IOException;

  /** The factory that can create (or return) the pools used by the encoded reader. */
  public interface PoolFactory {
    <T> Pool<T> createPool(int size, PoolObjectHelper<T> helper);
    Pool<OrcEncodedColumnBatch> createEncodedColumnBatchPool();
    Pool<ColumnStreamData> createColumnStreamDataPool();
  }

  /** Implementation of EncodedColumnBatch for ORC. */
  public static final class OrcEncodedColumnBatch extends EncodedColumnBatch<OrcBatchKey> {
    /** RG index indicating that the data applies to all RGs (e.g. a string dictionary). */
    public static final int ALL_RGS = -1;
    /**
     * All the previous streams are data streams; this and the next ones are index streams.
     * We assume the order will stay the same for backward compatibility.
     */
    public static final int MAX_DATA_STREAMS = OrcProto.Stream.Kind.ROW_INDEX.getNumber();

    /** (Re)initializes the batch for the given file, stripe and row group. */
    public void init(Object fileKey, int stripeIx, int rgIx, int columnCount) {
      if (batchKey == null) {
        batchKey = new OrcBatchKey(fileKey, stripeIx, rgIx);
      } else {
        batchKey.set(fileKey, stripeIx, rgIx);
      }
      resetColumnArrays(columnCount);
    }

    /** Initializes a column with slots for the maximum number of ORC data streams. */
    public void initOrcColumn(int colIx) {
      super.initColumn(colIx, MAX_DATA_STREAMS);
    }

    /**
     * Same as columnData, but for the data that already comes as VRBs.
     * The combination of the two contains all the necessary data.
     */
    protected List<ColumnVector>[] columnVectors;

    @Override
    public void reset() {
      super.reset();
      if (columnVectors == null) return;
      Arrays.fill(columnVectors, null);
    }

    /** Initializes a column with already-decoded vectors instead of encoded streams. */
    @SuppressWarnings("unchecked")
    public void initColumnWithVectors(int colIx, List<ColumnVector> data) {
      if (columnVectors == null) {
        columnVectors = new List[columnData.length];
      }
      columnVectors[colIx] = data;
    }

    @SuppressWarnings("unchecked")
    @Override
    protected void resetColumnArrays(int columnCount) {
      super.resetColumnArrays(columnCount);
      if (columnVectors != null && columnCount == columnVectors.length) {
        // Same column count; just clear the existing array.
        Arrays.fill(columnVectors, null);
        return;
      }
      if (columnVectors != null) {
        columnVectors = new List[columnCount];
      } else {
        // Stays null; the array is allocated lazily in initColumnWithVectors.
        columnVectors = null;
      }
    }

    /** Returns whether the column has vector data (see initColumnWithVectors). */
    public boolean hasVectors(int colIx) {
      return columnVectors != null && columnVectors[colIx] != null;
    }

    /** Returns the vector data for a column; the column must have vectors (see hasVectors). */
    public List<ColumnVector> getColumnVectors(int colIx) {
      if (!hasVectors(colIx)) throw new AssertionError("No data for column " + colIx);
      return columnVectors[colIx];
    }
  }
}
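// ------------------------------------------------------------------------------------------------
// Illustrative sketch, not part of the interface above: a deliberately trivial PoolFactory that
// does no pooling and allocates a fresh object on every take(). It assumes the take()/offer()/size()
// contract of org.apache.hadoop.hive.common.Pool and no-arg constructors for OrcEncodedColumnBatch
// and ColumnStreamData; a real factory would retain and reuse objects to avoid GC pressure.
// ------------------------------------------------------------------------------------------------
class NoPoolingPoolFactorySketch implements Reader.PoolFactory {

  @Override
  public <T> Pool<T> createPool(int size, final PoolObjectHelper<T> helper) {
    // The size hint is ignored because nothing is ever retained.
    return new Pool<T>() {
      @Override
      public T take() {
        return helper.create(); // always allocate a new object
      }

      @Override
      public void offer(T t) {
        helper.resetBeforeOffer(t); // reset as the contract requires, then let GC collect it
      }

      @Override
      public int size() {
        return 0; // the "pool" never holds anything
      }
    };
  }

  @Override
  public Pool<Reader.OrcEncodedColumnBatch> createEncodedColumnBatchPool() {
    return createPool(0, new PoolObjectHelper<Reader.OrcEncodedColumnBatch>() {
      @Override
      public Reader.OrcEncodedColumnBatch create() {
        return new Reader.OrcEncodedColumnBatch();
      }

      @Override
      public void resetBeforeOffer(Reader.OrcEncodedColumnBatch batch) {
        batch.reset(); // drops per-column stream and vector references before reuse
      }
    });
  }

  @Override
  public Pool<ColumnStreamData> createColumnStreamDataPool() {
    return createPool(0, new PoolObjectHelper<ColumnStreamData>() {
      @Override
      public ColumnStreamData create() {
        return new ColumnStreamData();
      }

      @Override
      public void resetBeforeOffer(ColumnStreamData csd) {
        // No-op in this sketch; a real pool would release or clear the cached buffers here.
      }
    });
  }
}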
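// ------------------------------------------------------------------------------------------------
// Illustrative sketch, not part of the interface above: the OrcEncodedColumnBatch lifecycle when
// one column arrives as already-decoded vectors and another as encoded streams. The file key,
// stripe/RG indices, column count and the use of LongColumnVector are arbitrary example values.
// ------------------------------------------------------------------------------------------------
class OrcEncodedColumnBatchUsageSketch {

  static Reader.OrcEncodedColumnBatch buildExampleBatch() {
    Reader.OrcEncodedColumnBatch batch = new Reader.OrcEncodedColumnBatch();

    // Bind the batch to (file, stripe, row group) and size its per-column arrays for two columns.
    batch.init("example-file-key", /* stripeIx */ 0, /* rgIx */ 0, /* columnCount */ 2);

    // Column 0: data already materialized as vectors, e.g. produced by an earlier decoding pass.
    List<ColumnVector> vectors = Arrays.<ColumnVector>asList(
        new org.apache.hadoop.hive.ql.exec.vector.LongColumnVector(1024));
    batch.initColumnWithVectors(0, vectors);

    // Column 1: would carry encoded streams instead; allocate its stream slots.
    batch.initOrcColumn(1);

    // A consumer branches on hasVectors() per column to pick the decoding path.
    assert batch.hasVectors(0) && !batch.hasVectors(1);
    assert batch.getColumnVectors(0) == vectors;
    return batch;
  }
}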