/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.llap.io.encoded;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.security.PrivilegedExceptionAction;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

import org.apache.hadoop.hive.llap.counters.LlapIOCounters;
import org.apache.orc.TypeDescription;
import org.apache.orc.impl.DataReaderProperties;
import org.apache.orc.impl.OrcIndex;
import org.apache.orc.impl.OrcTail;
import org.apache.orc.impl.SchemaEvolution;
import org.apache.tez.common.counters.TezCounters;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.Pool;
import org.apache.hadoop.hive.common.Pool.PoolObjectHelper;
import org.apache.hadoop.hive.common.io.DataCache;
import org.apache.hadoop.hive.common.io.Allocator;
import org.apache.hadoop.hive.common.io.encoded.EncodedColumnBatch.ColumnStreamData;
import org.apache.hadoop.hive.common.io.DiskRange;
import org.apache.hadoop.hive.common.io.DiskRangeList;
import org.apache.hadoop.hive.common.io.encoded.MemoryBuffer;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
import org.apache.hadoop.hive.llap.ConsumerFeedback;
import org.apache.hadoop.hive.llap.DebugUtils;
import org.apache.hadoop.hive.llap.cache.BufferUsageManager;
import org.apache.hadoop.hive.llap.cache.LowLevelCache;
import org.apache.hadoop.hive.llap.cache.LowLevelCache.Priority;
import org.apache.hadoop.hive.llap.counters.QueryFragmentCounters;
import org.apache.hadoop.hive.llap.io.api.impl.LlapIoImpl;
import org.apache.hadoop.hive.llap.io.decode.OrcEncodedDataConsumer;
import org.apache.hadoop.hive.llap.io.metadata.OrcFileMetadata;
import org.apache.hadoop.hive.llap.io.metadata.OrcMetadataCache;
import org.apache.hadoop.hive.llap.io.metadata.OrcStripeMetadata;
import org.apache.hadoop.hive.ql.exec.DDLTask;
import org.apache.hadoop.hive.ql.io.AcidUtils;
import org.apache.hadoop.hive.ql.io.HdfsUtils;
import org.apache.orc.CompressionKind;
import org.apache.orc.DataReader;
import org.apache.hadoop.hive.ql.io.orc.OrcFile;
import org.apache.hadoop.hive.ql.io.orc.OrcFile.ReaderOptions;
import org.apache.orc.OrcConf;
import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcSplit;
import org.apache.hadoop.hive.ql.io.orc.encoded.Reader;
import org.apache.hadoop.hive.ql.io.orc.RecordReaderImpl;
import org.apache.hadoop.hive.ql.io.orc.encoded.EncodedOrcFile;
import org.apache.hadoop.hive.ql.io.orc.encoded.EncodedReader;
import org.apache.hadoop.hive.ql.io.orc.encoded.OrcBatchKey;
import org.apache.hadoop.hive.ql.io.orc.encoded.Reader.OrcEncodedColumnBatch;
import org.apache.hadoop.hive.ql.io.orc.encoded.Reader.PoolFactory;
import org.apache.orc.impl.RecordReaderUtils;
import org.apache.orc.StripeInformation;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hive.common.util.FixedSizedObjectPool;
import org.apache.orc.OrcProto;
import org.apache.tez.common.CallableWithNdc;

/**
 * This produces EncodedColumnBatch via ORC EncodedDataImpl.
 * It serves as Consumer for EncodedColumnBatch too, for the high-level cache scenario where
 * it inserts itself into the pipeline to put the data in cache, before passing it to the real
 * consumer. It also serves as ConsumerFeedback that receives processed EncodedColumnBatch-es.
 */
public class OrcEncodedDataReader extends CallableWithNdc<Void>
    implements ConsumerFeedback<OrcEncodedColumnBatch>, TezCounterSource {
  private static final Logger LOG = LoggerFactory.getLogger(OrcEncodedDataReader.class);

  public static final FixedSizedObjectPool<ColumnStreamData> CSD_POOL =
      new FixedSizedObjectPool<>(8192, new PoolObjectHelper<ColumnStreamData>() {
        @Override
        public ColumnStreamData create() {
          return new ColumnStreamData();
        }
        @Override
        public void resetBeforeOffer(ColumnStreamData t) {
          t.reset();
        }
      });
  public static final FixedSizedObjectPool<OrcEncodedColumnBatch> ECB_POOL =
      new FixedSizedObjectPool<>(1024, new PoolObjectHelper<OrcEncodedColumnBatch>() {
        @Override
        public OrcEncodedColumnBatch create() {
          return new OrcEncodedColumnBatch();
        }
        @Override
        public void resetBeforeOffer(OrcEncodedColumnBatch t) {
          t.reset();
        }
      });
  private static final PoolFactory POOL_FACTORY = new PoolFactory() {
    @Override
    public <T> Pool<T> createPool(int size, PoolObjectHelper<T> helper) {
      return new FixedSizedObjectPool<>(size, helper);
    }

    @Override
    public Pool<ColumnStreamData> createColumnStreamDataPool() {
      return CSD_POOL;
    }

    @Override
    public Pool<OrcEncodedColumnBatch> createEncodedColumnBatchPool() {
      return ECB_POOL;
    }
  };

  private final OrcMetadataCache metadataCache;
  private final LowLevelCache lowLevelCache;
  private final BufferUsageManager bufferManager;
  private final Configuration daemonConf, jobConf;
  private final FileSplit split;
  private List<Integer> includedColumnIds;
  private final SearchArgument sarg;
  private final String[] columnNames;
  private final OrcEncodedDataConsumer consumer;
  private final QueryFragmentCounters counters;
  private final UserGroupInformation ugi;
  private final SchemaEvolution evolution;

  // Read state.
  private int stripeIxFrom;
  private OrcFileMetadata fileMetadata;
  private Path path;
  private Reader orcReader;
  private DataReader metadataReader;
  private EncodedReader stripeReader;
  private Object fileKey;
  private FileSystem fs;
  /**
   * readState[stripeIx'][colIx'] => boolean array (could be a bitmask) of rg-s that need to be
   * read. Contains only stripes that are read, and only columns included. null => read all RGs.
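   * For example (hypothetical values), with two stripes to read and two included columns, where
   * a SARG keeps only row groups 0 and 2 of 3 in the first stripe and everything in the second:
   * readState[0][0] = readState[0][1] = {true, false, true}; readState[1][0] = readState[1][1] = null.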
   */
  private boolean[][][] readState;
  private volatile boolean isStopped = false;
  @SuppressWarnings("unused")
  private volatile boolean isPaused = false;

  boolean[] globalIncludes = null;

  public OrcEncodedDataReader(LowLevelCache lowLevelCache, BufferUsageManager bufferManager,
      OrcMetadataCache metadataCache, Configuration daemonConf, Configuration jobConf,
      FileSplit split, List<Integer> columnIds, SearchArgument sarg, String[] columnNames,
      OrcEncodedDataConsumer consumer, QueryFragmentCounters counters,
      TypeDescription readerSchema) throws IOException {
    this.lowLevelCache = lowLevelCache;
    this.metadataCache = metadataCache;
    this.bufferManager = bufferManager;
    this.daemonConf = daemonConf;
    this.split = split;
    this.includedColumnIds = columnIds;
    if (this.includedColumnIds != null) {
      Collections.sort(this.includedColumnIds);
    }
    this.sarg = sarg;
    this.columnNames = columnNames;
    this.consumer = consumer;
    this.counters = counters;
    try {
      this.ugi = UserGroupInformation.getCurrentUser();
    } catch (IOException e) {
      throw new RuntimeException(e);
    }

    // Moved this part of the code from performDataRead, as LlapInputFormat needs to know the
    // file schema to decide whether schema evolution is supported or not.
    orcReader = null;
    // 1. Get file metadata from cache, or create the reader and read it.
    // Don't cache the filesystem object for now; Tez closes it and FS cache will fix all that.
    fs = split.getPath().getFileSystem(jobConf);
    fileKey = determineFileId(fs, split,
        HiveConf.getBoolVar(daemonConf, ConfVars.LLAP_CACHE_ALLOW_SYNTHETIC_FILEID));
    fileMetadata = getOrReadFileMetadata();
    if (readerSchema == null) {
      readerSchema = fileMetadata.getSchema();
    }
    globalIncludes = OrcInputFormat.genIncludedColumns(readerSchema, includedColumnIds);
    // Do not allow users to override the zero-copy setting. The rest can be taken from user config.
    boolean useZeroCopy = OrcConf.USE_ZEROCOPY.getBoolean(daemonConf);
    if (useZeroCopy != OrcConf.USE_ZEROCOPY.getBoolean(jobConf)) {
      jobConf = new Configuration(jobConf);
      jobConf.setBoolean(OrcConf.USE_ZEROCOPY.getAttribute(), useZeroCopy);
    }
    this.jobConf = jobConf;
    Reader.Options options = new Reader.Options(jobConf).include(globalIncludes);
    evolution = new SchemaEvolution(fileMetadata.getSchema(), readerSchema, options);
    consumer.setFileMetadata(fileMetadata);
    consumer.setIncludedColumns(globalIncludes);
    consumer.setSchemaEvolution(evolution);
  }

  @Override
  public void stop() {
    LOG.debug("Encoded reader is being stopped");
    isStopped = true;
  }

  @Override
  public void pause() {
    isPaused = true;
    // TODO: pause fetching
  }

  @Override
  public void unpause() {
    isPaused = false;
    // TODO: unpause fetching
  }

  @Override
  protected Void callInternal() throws IOException, InterruptedException {
    return ugi.doAs(new PrivilegedExceptionAction<Void>() {
      @Override
      public Void run() throws Exception {
        return performDataRead();
      }
    });
  }

  protected Void performDataRead() throws IOException {
    long startTime = counters.startTimeCounter();
    LlapIoImpl.LOG.info("Processing data for {}", split.getPath());
    if (processStop()) {
      recordReaderTime(startTime);
      return null;
    }
    counters.setDesc(QueryFragmentCounters.Desc.TABLE, getDbAndTableName(split.getPath()));
    counters.setDesc(QueryFragmentCounters.Desc.FILE, split.getPath()
        + (fileKey == null ? "" : " (" + fileKey + ")"));
    try {
      validateFileMetadata();
      if (includedColumnIds == null) {
        includedColumnIds = getAllColumnIds(fileMetadata);
      }
      // 2. Determine which stripes to read based on the split.
      determineStripesToRead();
    } catch (Throwable t) {
      recordReaderTime(startTime);
      consumer.setError(t);
      return null;
    }

    if (readState.length == 0) {
      consumer.setDone();
      recordReaderTime(startTime);
      return null; // No data to read.
    }
    counters.setDesc(QueryFragmentCounters.Desc.STRIPES, stripeIxFrom + "," + readState.length);

    // 3. Apply SARG if needed, and otherwise determine what RGs to read.
    int stride = fileMetadata.getRowIndexStride();
    ArrayList<OrcStripeMetadata> stripeMetadatas = null;
    boolean[] sargColumns = null;
    try {
      if (sarg != null && stride != 0) {
        // TODO: move this to a common method
        int[] filterColumns = RecordReaderImpl.mapSargColumnsToOrcInternalColIdx(
            sarg.getLeaves(), evolution);
        // included will not be null; row options will fill the array with trues if null.
        sargColumns = new boolean[globalIncludes.length];
        for (int i : filterColumns) {
          // Filter columns may have -1 as index, which could be a partition column in the SARG.
          if (i > 0) {
            sargColumns[i] = true;
          }
        }

        // If a SARG is present, get the relevant stripe metadata from cache or readers.
        stripeMetadatas = readStripesMetadata(globalIncludes, sargColumns);
      }

      // Now, apply the SARG if any; w/o a sarg, this will just initialize readState.
      boolean hasData = determineRgsToRead(globalIncludes, stride, stripeMetadatas);
      if (!hasData) {
        consumer.setDone();
        recordReaderTime(startTime);
        return null; // No data to read.
      }
    } catch (Throwable t) {
      cleanupReaders();
      consumer.setError(t);
      recordReaderTime(startTime);
      return null;
    }

    if (processStop()) {
      recordReaderTime(startTime);
      return null;
    }

    // 4. Create the encoded data reader.
    try {
      ensureOrcReader();
      // Reader creation updates HDFS counters, don't do it here.
      DataWrapperForOrc dw = new DataWrapperForOrc();
      stripeReader = orcReader.encodedReader(fileKey, dw, dw, POOL_FACTORY);
      stripeReader.setTracing(LlapIoImpl.ORC_LOGGER.isTraceEnabled());
    } catch (Throwable t) {
      consumer.setError(t);
      recordReaderTime(startTime);
      cleanupReaders();
      return null;
    }

    // 5. Read data.
    // TODO: an I/O threadpool could be here - one thread per stripe; for now, linear.
    boolean hasFileId = this.fileKey != null;
    OrcBatchKey stripeKey = hasFileId ? new OrcBatchKey(fileKey, -1, 0) : null;
    for (int stripeIxMod = 0; stripeIxMod < readState.length; ++stripeIxMod) {
      if (processStop()) {
        recordReaderTime(startTime);
        return null;
      }
      int stripeIx = stripeIxFrom + stripeIxMod;
      boolean[][] colRgs = null;
      OrcStripeMetadata stripeMetadata = null;
      StripeInformation stripe;
      try {
        stripe = fileMetadata.getStripes().get(stripeIx);

        LlapIoImpl.ORC_LOGGER.trace("Reading stripe {}: {}, {}", stripeIx, stripe.getOffset(),
            stripe.getLength());
        colRgs = readState[stripeIxMod];
        if (LlapIoImpl.ORC_LOGGER.isTraceEnabled()) {
          LlapIoImpl.ORC_LOGGER.trace("readState[{}]: {}", stripeIxMod, Arrays.toString(colRgs));
        }

        // We assume that the NO_RGS value is only set from the SARG filter and for all columns;
        // intermediate changes for individual columns will unset values in the array.
        // Skip this case for a 0-column read. We could probably special-case it just like we do
        // in EncodedReaderImpl, but for now it's not that important.
        if (colRgs.length > 0 && colRgs[0] == RecordReaderImpl.SargApplier.READ_NO_RGS) continue;

        // 5.1. Ensure we have stripe metadata. We might have read it before for RG filtering.
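        // The lookup below first tries the stripe-level metadata cache (when a file ID is
        // available); on a miss, the stripe footer and indexes are read from the file and the
        // result is published back to the cache.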
        boolean isFoundInCache = false;
        if (stripeMetadatas != null) {
          stripeMetadata = stripeMetadatas.get(stripeIxMod);
        } else {
          if (hasFileId && metadataCache != null) {
            stripeKey.stripeIx = stripeIx;
            stripeMetadata = metadataCache.getStripeMetadata(stripeKey);
          }
          isFoundInCache = (stripeMetadata != null);
          if (!isFoundInCache) {
            counters.incrCounter(LlapIOCounters.METADATA_CACHE_MISS);
            ensureMetadataReader();
            long startTimeHdfs = counters.startTimeCounter();
            stripeMetadata = new OrcStripeMetadata(new OrcBatchKey(fileKey, stripeIx, 0),
                metadataReader, stripe, globalIncludes, sargColumns, orcReader.getSchema(),
                orcReader.getWriterVersion());
            counters.incrTimeCounter(LlapIOCounters.HDFS_TIME_NS, startTimeHdfs);
            if (hasFileId && metadataCache != null) {
              stripeMetadata = metadataCache.putStripeMetadata(stripeMetadata);
              if (LlapIoImpl.ORC_LOGGER.isTraceEnabled()) {
                LlapIoImpl.ORC_LOGGER.trace("Caching stripe {} metadata with includes: {}",
                    stripeKey.stripeIx, DebugUtils.toString(globalIncludes));
              }
            }
          }
          consumer.setStripeMetadata(stripeMetadata);
        }
        if (!stripeMetadata.hasAllIndexes(globalIncludes)) {
          if (LlapIoImpl.ORC_LOGGER.isTraceEnabled()) {
            LlapIoImpl.ORC_LOGGER.trace("Updating indexes in stripe {} metadata for includes: {}",
                stripeKey.stripeIx, DebugUtils.toString(globalIncludes));
          }
          assert isFoundInCache;
          counters.incrCounter(LlapIOCounters.METADATA_CACHE_MISS);
          ensureMetadataReader();
          updateLoadedIndexes(stripeMetadata, stripe, globalIncludes, sargColumns);
        } else if (isFoundInCache) {
          counters.incrCounter(LlapIOCounters.METADATA_CACHE_HIT);
        }
      } catch (Throwable t) {
        consumer.setError(t);
        cleanupReaders();
        recordReaderTime(startTime);
        return null;
      }
      if (processStop()) {
        recordReaderTime(startTime);
        return null;
      }

      // 5.2. Finally, hand off to the stripe reader to produce the data.
      // This is a sync call that will feed data to the consumer.
      try {
        // TODO: readEncodedColumns is not supposed to throw; errors should be propagated thru
        // consumer. It is potentially holding locked buffers, and must perform its own cleanup.
        // Also, currently readEncodedColumns is not stoppable. The consumer will discard the
        // data it receives for one stripe. We could probably interrupt it, if it checked that.
        stripeReader.readEncodedColumns(stripeIx, stripe, stripeMetadata.getRowIndexes(),
            stripeMetadata.getEncodings(), stripeMetadata.getStreams(), globalIncludes,
            colRgs, consumer);
      } catch (Throwable t) {
        consumer.setError(t);
        cleanupReaders();
        recordReaderTime(startTime);
        return null;
      }
    }

    // Done with all the things.
    recordReaderTime(startTime);
    consumer.setDone();

    LlapIoImpl.LOG.trace("done processing {}", split);

    // Close the stripe reader, we are done reading.
    cleanupReaders();
    return null;
  }

  private void recordReaderTime(long startTime) {
    counters.incrTimeCounter(LlapIOCounters.TOTAL_IO_TIME_NS, startTime);
  }

  private static String getDbAndTableName(Path path) {
    // Ideally, we'd get this from the split; however, the split doesn't contain any such thing,
    // and it's actually pretty hard to get because even the split generator only uses paths.
    // We only need this for metrics; therefore, brace for BLACK MAGIC!
    String[] parts = path.toUri().getPath().toString().split(Path.SEPARATOR);
    int dbIx = -1;
    // Try to find the default db postfix; don't check the last two components - at least there
    // should be a table and a file (we could also try to throw away partition/bucket/acid stuff).
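    // For example (hypothetical path), /warehouse/mydb.db/mytable/000000_0 would yield
    // "mydb.mytable": "mydb.db" carries the database path suffix, and the next component
    // is the table directory.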
    for (int i = 0; i < parts.length - 2; ++i) {
      if (!parts[i].endsWith(DDLTask.DATABASE_PATH_SUFFIX)) continue;
      if (dbIx >= 0) {
        dbIx = -1; // Let's not guess.
        break;
      }
      dbIx = i;
    }
    if (dbIx >= 0) {
      return parts[dbIx].substring(0, parts[dbIx].length() - 3) + "." + parts[dbIx + 1];
    }

    // Just go from the back and throw away everything we think is wrong; skip the last item,
    // the file.
    boolean isInPartFields = false;
    for (int i = parts.length - 2; i >= 0; --i) {
      String p = parts[i];
      boolean isPartField = p.contains("=");
      if ((isInPartFields && !isPartField) || (!isPartField && !p.startsWith(AcidUtils.BASE_PREFIX)
          && !p.startsWith(AcidUtils.DELTA_PREFIX) && !p.startsWith(AcidUtils.BUCKET_PREFIX))) {
        dbIx = i - 1;
        break;
      }
      isInPartFields = isPartField;
    }
    // If we found something before we ran out of components, use it.
    if (dbIx >= 0) {
      String dbName = parts[dbIx];
      if (dbName.endsWith(DDLTask.DATABASE_PATH_SUFFIX)) {
        dbName = dbName.substring(0, dbName.length() - 3);
      }
      return dbName + "." + parts[dbIx + 1];
    }
    return "unknown";
  }

  private void validateFileMetadata() throws IOException {
    if (fileMetadata.getCompressionKind() == CompressionKind.NONE) return;
    int bufferSize = fileMetadata.getCompressionBufferSize();
    long minAllocSize = HiveConf.getSizeVar(daemonConf, ConfVars.LLAP_ALLOCATOR_MIN_ALLOC);
    if (bufferSize < minAllocSize) {
      LOG.warn("ORC compression buffer size (" + bufferSize + ") is smaller than LLAP low-level "
          + "cache minimum allocation size (" + minAllocSize + "). Decrease the value for "
          + HiveConf.ConfVars.LLAP_ALLOCATOR_MIN_ALLOC.toString() + " to avoid wasting memory");
    }
  }

  private boolean processStop() {
    if (!isStopped) return false;
    LOG.info("Encoded data reader is stopping");
    cleanupReaders();
    return true;
  }

  private static Object determineFileId(FileSystem fs, FileSplit split,
      boolean allowSynthetic) throws IOException {
    if (split instanceof OrcSplit) {
      Object fileKey = ((OrcSplit)split).getFileKey();
      if (fileKey != null) {
        return fileKey;
      }
    }
    LOG.warn("Split for " + split.getPath() + " (" + split.getClass() + ") does not have file ID");
    return HdfsUtils.getFileId(fs, split.getPath(), allowSynthetic);
  }

  /**
   * Builds a list of all column ids from the file metadata, so that all columns are read.
   */
  private static List<Integer> getAllColumnIds(OrcFileMetadata metadata) {
    int rootColumn = OrcInputFormat.getRootColumn(true);
    List<Integer> types = metadata.getTypes().get(rootColumn).getSubtypesList();
    List<Integer> columnIds = new ArrayList<Integer>(types.size());
    for (int i = 0; i < types.size(); ++i) {
      columnIds.add(i);
    }
    return columnIds;
  }

  /**
   * In case the stripe metadata in cache does not have all the indexes for the current query,
   * loads the missing ones. This is a temporary kludge until a real metadata cache becomes
   * available.
   */
  private void updateLoadedIndexes(OrcStripeMetadata stripeMetadata, StripeInformation stripe,
      boolean[] stripeIncludes, boolean[] sargColumns) throws IOException {
    // We only synchronize on write for now - the design of the metadata cache is very temporary;
    // we pre-allocate the array and never remove entries, so readers should be safe.
    synchronized (stripeMetadata) {
      if (stripeMetadata.hasAllIndexes(stripeIncludes)) return;
      long startTime = counters.startTimeCounter();
      stripeMetadata.loadMissingIndexes(metadataReader, stripe, stripeIncludes, sargColumns);
      counters.incrTimeCounter(LlapIOCounters.HDFS_TIME_NS, startTime);
    }
  }

  /**
   * Closes the stripe readers (on error).
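   * Also closes the metadata reader, if one was created; IOExceptions on close are ignored.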
   */
  private void cleanupReaders() {
    if (stripeReader != null) {
      try {
        stripeReader.close();
      } catch (IOException ex) {
        // Ignore.
      }
    }
    if (metadataReader != null) {
      try {
        metadataReader.close();
      } catch (IOException ex) {
        // Ignore.
      }
    }
  }

  /**
   * Ensures orcReader is initialized for the split.
   */
  private void ensureOrcReader() throws IOException {
    if (orcReader != null) return;
    path = split.getPath();
    if (fileKey instanceof Long && HiveConf.getBoolVar(
        daemonConf, ConfVars.LLAP_IO_USE_FILEID_PATH)) {
      path = HdfsUtils.getFileIdPath(fs, path, (long)fileKey);
    }
    LlapIoImpl.ORC_LOGGER.trace("Creating reader for {} ({})", path, split.getPath());
    long startTime = counters.startTimeCounter();
    ReaderOptions opts = OrcFile.readerOptions(jobConf).filesystem(fs).fileMetadata(fileMetadata);
    if (split instanceof OrcSplit) {
      OrcTail orcTail = ((OrcSplit) split).getOrcTail();
      if (orcTail != null) {
        LlapIoImpl.ORC_LOGGER.debug("Setting OrcTail. path={}", path);
        opts.orcTail(orcTail);
      }
    }
    orcReader = EncodedOrcFile.createReader(path, opts);
    counters.incrTimeCounter(LlapIOCounters.HDFS_TIME_NS, startTime);
  }

  /**
   * Gets file metadata for the split from cache, or reads it from the file.
   */
  private OrcFileMetadata getOrReadFileMetadata() throws IOException {
    OrcFileMetadata metadata = null;
    if (fileKey != null && metadataCache != null) {
      metadata = metadataCache.getFileMetadata(fileKey);
      if (metadata != null) {
        counters.incrCounter(LlapIOCounters.METADATA_CACHE_HIT);
        return metadata;
      } else {
        counters.incrCounter(LlapIOCounters.METADATA_CACHE_MISS);
      }
    }
    ensureOrcReader();
    // We assume this call doesn't touch HDFS because everything is already read; don't add time.
    metadata = new OrcFileMetadata(fileKey, orcReader);
    if (fileKey == null || metadataCache == null) return metadata;
    return metadataCache.putFileMetadata(metadata);
  }

  /**
   * Reads the metadata for all the stripes covered by the split.
   */
  private ArrayList<OrcStripeMetadata> readStripesMetadata(
      boolean[] globalInc, boolean[] sargColumns) throws IOException {
    ArrayList<OrcStripeMetadata> result = new ArrayList<OrcStripeMetadata>(readState.length);
    boolean hasFileId = this.fileKey != null;
    OrcBatchKey stripeKey = hasFileId ? new OrcBatchKey(fileKey, 0, 0) : null;
    for (int stripeIxMod = 0; stripeIxMod < readState.length; ++stripeIxMod) {
      OrcStripeMetadata value = null;
      int stripeIx = stripeIxMod + stripeIxFrom;
      if (hasFileId && metadataCache != null) {
        stripeKey.stripeIx = stripeIx;
        value = metadataCache.getStripeMetadata(stripeKey);
      }
      if (value == null || !value.hasAllIndexes(globalInc)) {
        counters.incrCounter(LlapIOCounters.METADATA_CACHE_MISS);
        ensureMetadataReader();
        StripeInformation si = fileMetadata.getStripes().get(stripeIx);
        if (value == null) {
          long startTime = counters.startTimeCounter();
          value = new OrcStripeMetadata(new OrcBatchKey(fileKey, stripeIx, 0), metadataReader, si,
              globalInc, sargColumns, orcReader.getSchema(), orcReader.getWriterVersion());
          counters.incrTimeCounter(LlapIOCounters.HDFS_TIME_NS, startTime);
          if (hasFileId && metadataCache != null) {
            value = metadataCache.putStripeMetadata(value);
            if (LlapIoImpl.ORC_LOGGER.isTraceEnabled()) {
              LlapIoImpl.ORC_LOGGER.trace("Caching stripe {} metadata with includes: {}",
                  stripeKey.stripeIx, DebugUtils.toString(globalInc));
            }
          }
        }
        // We might have got an old value from the cache; recheck that it has the indexes we need.
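        // (Either the initial lookup or putStripeMetadata may have returned an entry that was
        // cached by another caller with a narrower set of includes.)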
        if (!value.hasAllIndexes(globalInc)) {
          if (LlapIoImpl.ORC_LOGGER.isTraceEnabled()) {
            LlapIoImpl.ORC_LOGGER.trace("Updating indexes in stripe {} metadata for includes: {}",
                stripeKey.stripeIx, DebugUtils.toString(globalInc));
          }
          updateLoadedIndexes(value, si, globalInc, sargColumns);
        }
      } else {
        counters.incrCounter(LlapIOCounters.METADATA_CACHE_HIT);
      }
      result.add(value);
      consumer.setStripeMetadata(value);
    }
    return result;
  }

  private void ensureMetadataReader() throws IOException {
    ensureOrcReader();
    if (metadataReader != null) return;
    long startTime = counters.startTimeCounter();
    boolean useZeroCopy = (daemonConf != null) && OrcConf.USE_ZEROCOPY.getBoolean(daemonConf);
    metadataReader = RecordReaderUtils.createDefaultDataReader(
        DataReaderProperties.builder()
        .withBufferSize(orcReader.getCompressionSize())
        .withCompression(orcReader.getCompressionKind())
        .withFileSystem(fs)
        .withPath(path)
        .withTypeCount(orcReader.getSchema().getMaximumId() + 1)
        .withZeroCopy(useZeroCopy)
        .build());
    counters.incrTimeCounter(LlapIOCounters.HDFS_TIME_NS, startTime);
  }

  @Override
  public void returnData(OrcEncodedColumnBatch ecb) {
    for (int colIx = 0; colIx < ecb.getTotalColCount(); ++colIx) {
      if (!ecb.hasData(colIx)) continue;
      ColumnStreamData[] datas = ecb.getColumnData(colIx);
      for (ColumnStreamData data : datas) {
        if (data == null || data.decRef() != 0) continue;
        if (LlapIoImpl.LOCKING_LOGGER.isTraceEnabled()) {
          for (MemoryBuffer buf : data.getCacheBuffers()) {
            LlapIoImpl.LOCKING_LOGGER.trace("Unlocking {} at the end of processing", buf);
          }
        }
        bufferManager.decRefBuffers(data.getCacheBuffers());
        CSD_POOL.offer(data);
      }
    }
    // We can offer ECB even with some streams not discarded; reset() will clear the arrays.
    ECB_POOL.offer(ecb);
  }

  /**
   * Determines which RGs need to be read, after stripes have been determined.
   * SARG is applied, and readState is populated for each stripe accordingly.
   */
  private boolean determineRgsToRead(boolean[] globalIncludes, int rowIndexStride,
      ArrayList<OrcStripeMetadata> metadata) throws IOException {
    RecordReaderImpl.SargApplier sargApp = null;
    if (sarg != null && rowIndexStride != 0) {
      sargApp = new RecordReaderImpl.SargApplier(sarg, rowIndexStride, evolution,
          OrcFile.WriterVersion.from(fileMetadata.getWriterVersionNum()));
    }
    boolean hasAnyData = false;
    // readState should have been initialized by this time with an empty array.
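    // For each stripe in the split, apply the SARG (if present) to pick row groups and record
    // the per-column result; without a SARG, rgsToRead stays null, which means "read all RGs".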
    for (int stripeIxMod = 0; stripeIxMod < readState.length; ++stripeIxMod) {
      int stripeIx = stripeIxMod + stripeIxFrom;
      StripeInformation stripe = fileMetadata.getStripes().get(stripeIx);
      int rgCount = getRgCount(stripe, rowIndexStride);
      boolean[] rgsToRead = null;
      if (sargApp != null) {
        OrcStripeMetadata stripeMetadata = metadata.get(stripeIxMod);
        rgsToRead = sargApp.pickRowGroups(stripe, stripeMetadata.getRowIndexes(),
            stripeMetadata.getBloomFilterKinds(), stripeMetadata.getEncodings(),
            stripeMetadata.getBloomFilterIndexes(), true);
      }
      boolean isNone = rgsToRead == RecordReaderImpl.SargApplier.READ_NO_RGS,
          isAll = rgsToRead == RecordReaderImpl.SargApplier.READ_ALL_RGS;
      hasAnyData = hasAnyData || !isNone;
      if (LlapIoImpl.ORC_LOGGER.isTraceEnabled()) {
        if (isNone) {
          LlapIoImpl.ORC_LOGGER.trace("SARG eliminated all RGs for stripe {}", stripeIx);
        } else if (!isAll) {
          LlapIoImpl.ORC_LOGGER.trace("SARG picked RGs for stripe {}: {}",
              stripeIx, DebugUtils.toString(rgsToRead));
        } else {
          LlapIoImpl.ORC_LOGGER.trace("Will read all {} RGs for stripe {}", rgCount, stripeIx);
        }
      }
      assert isAll || isNone || rgsToRead.length == rgCount;
      int fileIncludesCount = 0;
      // TODO: hacky for now - skip the root column (id 0).
      //       We don't need a separate readState w/o the HL cache; should get rid of that instead.
      for (int includeIx = 1; includeIx < globalIncludes.length; ++includeIx) {
        fileIncludesCount += (globalIncludes[includeIx] ? 1 : 0);
      }
      readState[stripeIxMod] = new boolean[fileIncludesCount][];
      for (int includeIx = 0; includeIx < fileIncludesCount; ++includeIx) {
        readState[stripeIxMod][includeIx] = (isAll || isNone) ? rgsToRead :
            Arrays.copyOf(rgsToRead, rgsToRead.length);
      }
      adjustRgMetric(rgCount, rgsToRead, isNone, isAll);
    }
    return hasAnyData;
  }

  private void adjustRgMetric(int rgCount, boolean[] rgsToRead, boolean isNone, boolean isAll) {
    int count = 0;
    if (!isAll) {
      for (boolean b : rgsToRead) {
        if (b) count++;
      }
    } else if (!isNone) {
      count = rgCount;
    }
    counters.incrCounter(LlapIOCounters.SELECTED_ROWGROUPS, count);
  }

  private int getRgCount(StripeInformation stripe, int rowIndexStride) {
    return (int)Math.ceil((double)stripe.getNumberOfRows() / rowIndexStride);
  }

  /**
   * Determines which stripes to read for a split. Populates stripeIxFrom and readState.
   */
  public void determineStripesToRead() {
    // The unit of caching for ORC is (rg x column) (see OrcBatchKey).
    List<StripeInformation> stripes = fileMetadata.getStripes();
    long offset = split.getStart(), maxOffset = offset + split.getLength();
    stripeIxFrom = -1;
    int stripeIxTo = -1;
    if (LlapIoImpl.ORC_LOGGER.isDebugEnabled()) {
      String tmp = "FileSplit {" + split.getStart() + ", " + split.getLength() + "}; stripes ";
      for (StripeInformation stripe : stripes) {
        tmp += "{" + stripe.getOffset() + ", " + stripe.getLength() + "}, ";
      }
      LlapIoImpl.ORC_LOGGER.debug(tmp);
    }

    int stripeIx = 0;
    for (StripeInformation stripe : stripes) {
      long stripeStart = stripe.getOffset();
      if (offset > stripeStart) {
        // We assume splits will never start in the middle of a stripe.
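        // This stripe starts before the split's start offset, so it belongs to a preceding split.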
        ++stripeIx;
        continue;
      }
      if (stripeIxFrom == -1) {
        LlapIoImpl.ORC_LOGGER.trace("Including stripes from {} ({} >= {})",
            stripeIx, stripeStart, offset);
        stripeIxFrom = stripeIx;
      }
      if (stripeStart >= maxOffset) {
        stripeIxTo = stripeIx;
        LlapIoImpl.ORC_LOGGER.trace("Including stripes until {} ({} >= {}); {} stripes",
            stripeIxTo, stripeStart, maxOffset, (stripeIxTo - stripeIxFrom));
        break;
      }
      ++stripeIx;
    }
    if (stripeIxFrom == -1) {
      LlapIoImpl.LOG.info("Not including any stripes - empty split");
    }
    if (stripeIxTo == -1 && stripeIxFrom != -1) {
      stripeIxTo = stripeIx;
      LlapIoImpl.ORC_LOGGER.trace("Including stripes until {} (end of file); {} stripes",
          stripeIx, (stripeIxTo - stripeIxFrom));
    }
    readState = new boolean[stripeIxTo - stripeIxFrom][][];
  }

  private class DataWrapperForOrc implements DataReader, DataCache {
    private final DataReader orcDataReader;

    private DataWrapperForOrc(DataWrapperForOrc other) {
      orcDataReader = other.orcDataReader.clone();
    }

    public DataWrapperForOrc() throws IOException {
      ensureMetadataReader();
      this.orcDataReader = metadataReader.clone();
    }

    @Override
    public DiskRangeList getFileData(Object fileKey, DiskRangeList range, long baseOffset,
        DiskRangeListFactory factory, BooleanRef gotAllData) {
      DiskRangeList result = lowLevelCache.getFileData(
          fileKey, range, baseOffset, factory, counters, gotAllData);
      if (LlapIoImpl.ORC_LOGGER.isTraceEnabled()) {
        LlapIoImpl.ORC_LOGGER.trace("Disk ranges after data cache (file " + fileKey
            + ", base offset " + baseOffset + "): "
            + RecordReaderUtils.stringifyDiskRanges(result));
      }
      if (gotAllData.value) return result;
      return (metadataCache == null) ? result
          : metadataCache.getIncompleteCbs(fileKey, result, baseOffset, factory, gotAllData);
    }

    @Override
    public long[] putFileData(Object fileKey, DiskRange[] ranges,
        MemoryBuffer[] data, long baseOffset) {
      if (data != null) {
        return lowLevelCache.putFileData(
            fileKey, ranges, data, baseOffset, Priority.NORMAL, counters);
      } else if (metadataCache != null) {
        metadataCache.putIncompleteCbs(fileKey, ranges, baseOffset);
      }
      return null;
    }

    @Override
    public void releaseBuffer(MemoryBuffer buffer) {
      bufferManager.decRefBuffer(buffer);
    }

    @Override
    public void reuseBuffer(MemoryBuffer buffer) {
      boolean isReused = bufferManager.incRefBuffer(buffer);
      assert isReused;
    }

    @Override
    public Allocator getAllocator() {
      return bufferManager.getAllocator();
    }

    @Override
    public void close() throws IOException {
      orcDataReader.close();
      if (metadataReader != null) {
        metadataReader.close();
      }
    }

    @Override
    public DiskRangeList readFileData(DiskRangeList range, long baseOffset,
        boolean doForceDirect) throws IOException {
      long startTime = counters.startTimeCounter();
      DiskRangeList result = orcDataReader.readFileData(range, baseOffset, doForceDirect);
      counters.recordHdfsTime(startTime);
      if (LlapIoImpl.ORC_LOGGER.isTraceEnabled()) {
        LlapIoImpl.ORC_LOGGER.trace("Disk ranges after disk read (file {}, base offset {}): {}",
            fileKey, baseOffset, RecordReaderUtils.stringifyDiskRanges(result));
      }
      return result;
    }

    @Override
    public boolean isTrackingDiskRanges() {
      return orcDataReader.isTrackingDiskRanges();
    }

    @Override
    public void releaseBuffer(ByteBuffer buffer) {
      orcDataReader.releaseBuffer(buffer);
    }

    @Override
    public DataWrapperForOrc clone() {
      return new DataWrapperForOrc(this);
    }

    @Override
    public void open() throws IOException {
      long startTime = counters.startTimeCounter();
      orcDataReader.open();
      counters.recordHdfsTime(startTime);
    }

    @Override
    public OrcIndex readRowIndex(StripeInformation stripe, TypeDescription fileSchema,
        OrcProto.StripeFooter footer, boolean ignoreNonUtf8BloomFilter, boolean[] included,
        OrcProto.RowIndex[] indexes, boolean[] sargColumns,
        org.apache.orc.OrcFile.WriterVersion version, OrcProto.Stream.Kind[] bloomFilterKinds,
        OrcProto.BloomFilterIndex[] bloomFilterIndices) throws IOException {
      return orcDataReader.readRowIndex(stripe, fileSchema, footer, ignoreNonUtf8BloomFilter,
          included, indexes, sargColumns, version, bloomFilterKinds, bloomFilterIndices);
    }

    @Override
    public OrcProto.StripeFooter readStripeFooter(StripeInformation stripe) throws IOException {
      return orcDataReader.readStripeFooter(stripe);
    }
  }

  @Override
  public TezCounters getTezCounters() {
    return counters.getTezCounters();
  }
}