/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.llap.io.encoded;

import org.apache.orc.impl.MemoryManager;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.security.PrivilegedExceptionAction;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.Pool.PoolObjectHelper;
import org.apache.hadoop.hive.common.io.DataCache.BooleanRef;
import org.apache.hadoop.hive.common.io.DiskRangeList;
import org.apache.hadoop.hive.common.io.DataCache.DiskRangeListFactory;
import org.apache.hadoop.hive.common.io.encoded.EncodedColumnBatch.ColumnStreamData;
import org.apache.hadoop.hive.common.io.encoded.MemoryBuffer;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
import org.apache.hadoop.hive.llap.ConsumerFeedback;
import org.apache.hadoop.hive.llap.DebugUtils;
import org.apache.hadoop.hive.llap.cache.BufferUsageManager;
import org.apache.hadoop.hive.llap.cache.LlapDataBuffer;
import org.apache.hadoop.hive.llap.cache.LowLevelCache.Priority;
import org.apache.hadoop.hive.llap.cache.SerDeLowLevelCacheImpl;
import org.apache.hadoop.hive.llap.cache.SerDeLowLevelCacheImpl.FileData;
import org.apache.hadoop.hive.llap.cache.SerDeLowLevelCacheImpl.StripeData;
import org.apache.hadoop.hive.llap.counters.LlapIOCounters;
import org.apache.hadoop.hive.llap.counters.QueryFragmentCounters;
import org.apache.hadoop.hive.llap.io.api.impl.LlapIoImpl;
import org.apache.hadoop.hive.llap.io.decode.GenericColumnVectorProducer.SerDeStripeMetadata;
import org.apache.hadoop.hive.llap.io.decode.OrcEncodedDataConsumer;
import org.apache.hadoop.hive.llap.io.encoded.SerDeEncodedDataReader.CacheWriter;
import org.apache.hadoop.hive.llap.io.encoded.VectorDeserializeOrcWriter.AsyncCallback;
import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.io.HdfsUtils;
import org.apache.hadoop.hive.ql.io.orc.OrcFile;
import org.apache.hadoop.hive.ql.io.orc.OrcFile.WriterOptions;
import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
import org.apache.hadoop.hive.ql.io.orc.Reader;
import org.apache.hadoop.hive.ql.io.orc.Writer;
import org.apache.hadoop.hive.ql.io.orc.encoded.CacheChunk;
import org.apache.hadoop.hive.ql.io.orc.encoded.Reader.OrcEncodedColumnBatch;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.serde2.Deserializer;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.LineRecordReader;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SplitLocationInfo;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hive.common.util.FixedSizedObjectPool;
import org.apache.hive.common.util.Ref;
import org.apache.orc.CompressionCodec;
import org.apache.orc.CompressionKind;
import org.apache.orc.OrcConf;
import org.apache.orc.OrcUtils;
import org.apache.orc.OrcFile.EncodingStrategy;
import org.apache.orc.OrcFile.Version;
import org.apache.orc.OrcProto;
import org.apache.orc.OrcProto.ColumnEncoding;
import org.apache.orc.TypeDescription;
import org.apache.orc.impl.OutStream;
import org.apache.orc.PhysicalWriter;
import org.apache.orc.PhysicalWriter.OutputReceiver;
import org.apache.orc.impl.SchemaEvolution;
import org.apache.orc.impl.StreamName;
import org.apache.tez.common.CallableWithNdc;
import org.apache.tez.common.counters.TezCounters;

import com.google.common.collect.Lists;

public class SerDeEncodedDataReader extends CallableWithNdc<Void>
    implements ConsumerFeedback<OrcEncodedColumnBatch>, TezCounterSource {

  public static final FixedSizedObjectPool<ColumnStreamData> CSD_POOL =
      new FixedSizedObjectPool<>(8192, new PoolObjectHelper<ColumnStreamData>() {
        @Override
        public ColumnStreamData create() {
          return new ColumnStreamData();
        }
        @Override
        public void resetBeforeOffer(ColumnStreamData t) {
          t.reset();
        }
      });
  public static final FixedSizedObjectPool<OrcEncodedColumnBatch> ECB_POOL =
      new FixedSizedObjectPool<>(1024, new PoolObjectHelper<OrcEncodedColumnBatch>() {
        @Override
        public OrcEncodedColumnBatch create() {
          return new OrcEncodedColumnBatch();
        }
        @Override
        public void resetBeforeOffer(OrcEncodedColumnBatch t) {
          t.reset();
        }
      });
  public static final FixedSizedObjectPool<CacheChunk> TCC_POOL =
      new FixedSizedObjectPool<>(1024, new PoolObjectHelper<CacheChunk>() {
        @Override
        public CacheChunk create() {
          return new CacheChunk();
        }
        @Override
        public void resetBeforeOffer(CacheChunk t) {
          t.reset();
        }
      });
  private final static DiskRangeListFactory CC_FACTORY = new DiskRangeListFactory() {
    @Override
    public DiskRangeList createCacheChunk(MemoryBuffer buffer, long offset, long end) {
      CacheChunk tcc = TCC_POOL.take();
      tcc.init(buffer, offset, end);
      return tcc;
    }
  };

  private final SerDeLowLevelCacheImpl cache;
  private final BufferUsageManager bufferManager;
  private final Configuration daemonConf;
  private final FileSplit split;
  private List<Integer> columnIds;
  private final OrcEncodedDataConsumer consumer;
  private final QueryFragmentCounters counters;
  private final UserGroupInformation ugi;
  private final Map<Path, PartitionDesc> parts;

  private final Object fileKey;
  private final FileSystem fs;

  private volatile boolean isStopped = false;
  private final Deserializer sourceSerDe;
  private final InputFormat<?, ?> sourceInputFormat;
  private final Reporter reporter;
  private final JobConf jobConf;
  private final TypeDescription schema;
  private final int allocSize;
  private final int targetSliceRowCount;
  private final boolean isLrrEnabled;

  private final boolean[] writerIncludes;
  private FileReaderYieldReturn currentFileRead = null;
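  // Note (added): the FixedSizedObjectPool-s above follow a take/reset/offer lifecycle.
  // A minimal usage sketch (hypothetical caller):
  //   ColumnStreamData csd = CSD_POOL.take(); // reused instance, or a fresh one from create()
  //   csd.incRef();                           // the caller owns a reference
  //   ... populate the object and hand it off ...
  //   CSD_POOL.offer(csd);                    // resetBeforeOffer() clears it for reuse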
  /**
   * Data from cache currently being processed. We store it here so that we could decref
   * it in case of failures. We remove each slice from the data after it has been sent to
   * the consumer, at which point the consumer is responsible for it.
   */
  private FileData cachedData;
  private List<VectorDeserializeOrcWriter> asyncWriters = new ArrayList<>();

  public SerDeEncodedDataReader(SerDeLowLevelCacheImpl cache,
      BufferUsageManager bufferManager, Configuration daemonConf, FileSplit split,
      List<Integer> columnIds, OrcEncodedDataConsumer consumer, JobConf jobConf,
      Reporter reporter, InputFormat<?, ?> sourceInputFormat, Deserializer sourceSerDe,
      QueryFragmentCounters counters, TypeDescription schema,
      Map<Path, PartitionDesc> parts) throws IOException {
    this.cache = cache;
    this.bufferManager = bufferManager;
    this.parts = parts;
    this.daemonConf = new Configuration(daemonConf);
    // Disable dictionary encoding for the writer.
    this.daemonConf.setDouble(OrcConf.DICTIONARY_KEY_SIZE_THRESHOLD.name(), 0);
    this.split = split;
    this.columnIds = columnIds;
    this.allocSize = determineAllocSize(bufferManager, daemonConf);
    boolean isInTest = HiveConf.getBoolVar(daemonConf, ConfVars.HIVE_IN_TEST);
    Configuration sliceConf = isInTest ? jobConf : daemonConf;
    this.targetSliceRowCount = HiveConf.getIntVar(
        sliceConf, ConfVars.LLAP_IO_ENCODE_SLICE_ROW_COUNT);
    this.isLrrEnabled = HiveConf.getBoolVar(sliceConf, ConfVars.LLAP_IO_ENCODE_SLICE_LRR);
    if (this.columnIds != null) {
      Collections.sort(this.columnIds);
    }
    this.consumer = consumer;
    this.counters = counters;
    try {
      this.ugi = UserGroupInformation.getCurrentUser();
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
    fs = split.getPath().getFileSystem(daemonConf);
    fileKey = determineFileId(fs, split,
        HiveConf.getBoolVar(daemonConf, ConfVars.LLAP_CACHE_ALLOW_SYNTHETIC_FILEID));
    this.sourceInputFormat = sourceInputFormat;
    this.sourceSerDe = sourceSerDe;
    this.reporter = reporter;
    this.jobConf = jobConf;
    this.schema = schema;
    this.writerIncludes = OrcInputFormat.genIncludedColumns(schema, columnIds);
    SchemaEvolution evolution = new SchemaEvolution(schema, null,
        new Reader.Options(jobConf).include(writerIncludes));
    consumer.setSchemaEvolution(evolution);
  }

  private static int determineAllocSize(BufferUsageManager bufferManager, Configuration conf) {
    long allocSize = HiveConf.getSizeVar(conf, ConfVars.LLAP_IO_ENCODE_ALLOC_SIZE);
    int maxAllocSize = bufferManager.getAllocator().getMaxAllocation();
    if (allocSize > maxAllocSize) {
      LlapIoImpl.LOG.error("Encode allocation size " + allocSize + " is being capped to the "
          + "maximum allocation size " + bufferManager.getAllocator().getMaxAllocation());
      allocSize = maxAllocSize;
    }
    return (int)allocSize;
  }

  @Override
  public void stop() {
    LlapIoImpl.LOG.debug("Encoded reader is being stopped");
    isStopped = true;
  }

  @Override
  public void pause() {
    throw new UnsupportedOperationException();
  }

  @Override
  public void unpause() {
    throw new UnsupportedOperationException();
  }

  // TODO: move to a base class?
  @Override
  protected Void callInternal() throws IOException, InterruptedException {
    return ugi.doAs(new PrivilegedExceptionAction<Void>() {
      @Override
      public Void run() throws Exception {
        return performDataRead();
      }
    });
  }

  /** A row-based (Writable) reader that may also be able to report file offsets. */
  interface ReaderWithOffsets {
    /** Moves the reader to the next row. */
    boolean next() throws IOException;
    /** Gets the current row. */
    Writable getCurrentRow();
    /** Closes the reader. */
    void close() throws IOException;

    /** Whether this reader actually supports offsets. */
    boolean hasOffsets();
    /** Gets the start offset of the current row, or -1 if unknown. */
    long getCurrentRowStartOffset();
    /** Gets the end offset of the current row, or -1 if unknown. */
    long getCurrentRowEndOffset();
  }
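  // Note (added, assumption about the implementations): a reader that cannot report
  // offsets (e.g. the PassThruOffsetReader fallback used by createOffsetReader below)
  // returns false from hasOffsets() and -1 from both offset getters;
  // startReadSplitFromFile then disables split slicing for such readers.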
  public static class CacheWriter implements PhysicalWriter {
    // Struct.
    private static class CacheStreamData {
      private final List<MemoryBuffer> data;
      private final boolean isSuppressed;
      private final StreamName name;

      public CacheStreamData(boolean isSuppressed, StreamName name, List<MemoryBuffer> data) {
        this.isSuppressed = isSuppressed;
        this.name = name;
        this.data = data;
      }

      @Override
      public String toString() {
        return "CacheStreamData [name=" + name + ", isSuppressed="
            + isSuppressed + ", data=" + toString(data) + "]";
      }

      private static String toString(List<MemoryBuffer> data) {
        String s = "";
        for (MemoryBuffer buffer : data) {
          s += LlapDataBuffer.toDataString(buffer) + ", ";
        }
        return s;
      }
    }

    private static class CacheStripeData {
      private List<ColumnEncoding> encodings;
      private long rowCount = -1;
      private long knownTornStart, firstRowStart, lastRowStart, lastRowEnd;
      private Map<Integer, List<CacheStreamData>> colStreams = new HashMap<>();

      @Override
      public String toString() {
        return ("{disk data knownTornStart=" + knownTornStart
            + ", firstRowStart=" + firstRowStart + ", lastRowStart="
            + lastRowStart + ", lastRowEnd=" + lastRowEnd + ", rowCount=" + rowCount
            + ", encodings=" + encodings + ", streams=" + colStreams + "}").replace('\n', ' ');
      }

      public String toCoordinateString() {
        return "knownTornStart=" + knownTornStart + ", firstRowStart=" + firstRowStart
            + ", lastRowStart=" + lastRowStart + ", lastRowEnd=" + lastRowEnd;
      }
    }

    private CacheStripeData currentStripe;
    private final List<CacheStripeData> stripes = new ArrayList<>();
    private final BufferUsageManager bufferManager;

    /**
     * For !doesSourceHaveIncludes case, stores global column IDs to verify writer columns.
     * For doesSourceHaveIncludes case, stores source column IDs used to map things.
     */
    private final List<Integer> columnIds;
    private final boolean[] writerIncludes;
    // These are global since ORC reuses objects between stripes.
    private final Map<StreamName, OutputReceiver> streams = new HashMap<>();
    private final Map<Integer, List<CacheOutputReceiver>> colStreams = new HashMap<>();
    private final boolean doesSourceHaveIncludes;

    public CacheWriter(BufferUsageManager bufferManager, List<Integer> columnIds,
        boolean[] writerIncludes, boolean doesSourceHaveIncludes) {
      this.bufferManager = bufferManager;
      assert writerIncludes != null; // Taken care of on higher level.
      this.writerIncludes = writerIncludes;
      this.doesSourceHaveIncludes = doesSourceHaveIncludes;
      this.columnIds = columnIds;
      startStripe();
    }

    private void startStripe() {
      if (currentStripe != null) {
        stripes.add(currentStripe);
      }
      currentStripe = new CacheStripeData();
    }

    @Override
    public void writeFileMetadata(OrcProto.Metadata.Builder builder) throws IOException {
    }

    @Override
    public void writeFileFooter(OrcProto.Footer.Builder builder) throws IOException {
      OrcProto.Footer footer = builder.build();
      validateIncludes(footer);
    }

    public void validateIncludes(OrcProto.Footer footer) throws IOException {
      if (doesSourceHaveIncludes) return; // Irrelevant.
      boolean[] translatedIncludes = columnIds == null ? null : OrcInputFormat.genIncludedColumns(
          OrcUtils.convertTypeFromProtobuf(footer.getTypesList(), 0), columnIds);
      if (translatedIncludes == null) {
        throwIncludesMismatchError(translatedIncludes);
      }
      int len = Math.min(translatedIncludes.length, writerIncludes.length);
      for (int i = 0; i < len; ++i) {
        // Translated includes may be a superset of writer includes due to cache.
        if (!translatedIncludes[i] && writerIncludes[i]) {
          throwIncludesMismatchError(translatedIncludes);
        }
      }
      if (translatedIncludes.length < writerIncludes.length) {
        for (int i = len; i < writerIncludes.length; ++i) {
          if (writerIncludes[i]) {
            throwIncludesMismatchError(translatedIncludes);
          }
        }
      }
    }

    private String throwIncludesMismatchError(boolean[] translated) throws IOException {
      String s = "Includes derived from the original table: " + DebugUtils.toString(writerIncludes)
          + " but the ones derived from writer types are: " + DebugUtils.toString(translated);
      LlapIoImpl.LOG.error(s);
      throw new IOException(s);
    }
    @Override
    public long writePostScript(OrcProto.PostScript.Builder builder) {
      return 0;
    }

    @Override
    public void close() throws IOException {
      // Closed from ORC writer, we still need the data. Do not discard anything.
    }

    public void discardData() {
      LlapIoImpl.LOG.debug("Discarding disk data (if any wasn't cached)");
      for (CacheStripeData stripe : stripes) {
        if (stripe.colStreams == null || stripe.colStreams.isEmpty()) continue;
        for (List<CacheStreamData> streams : stripe.colStreams.values()) {
          for (CacheStreamData cos : streams) {
            for (MemoryBuffer buffer : cos.data) {
              if (LlapIoImpl.CACHE_LOGGER.isTraceEnabled()) {
                LlapIoImpl.CACHE_LOGGER.trace("Deallocating " + buffer);
              }
              bufferManager.getAllocator().deallocate(buffer);
            }
          }
        }
        stripe.colStreams.clear();
      }
    }

    @Override
    public OutputReceiver createDataStream(StreamName name) throws IOException {
      OutputReceiver or = streams.get(name);
      if (or != null) return or;
      if (isNeeded(name)) {
        if (LlapIoImpl.LOG.isTraceEnabled()) {
          LlapIoImpl.LOG.trace("Creating cache receiver for " + name);
        }
        CacheOutputReceiver cor = new CacheOutputReceiver(bufferManager, name);
        or = cor;
        List<CacheOutputReceiver> list = colStreams.get(name.getColumn());
        if (list == null) {
          list = new ArrayList<>();
          colStreams.put(name.getColumn(), list);
        }
        list.add(cor);
      } else {
        if (LlapIoImpl.LOG.isTraceEnabled()) {
          LlapIoImpl.LOG.trace("Creating null receiver for " + name);
        }
        or = new NullOutputReceiver(name);
      }
      streams.put(name, or);
      return or;
    }

    @Override
    public void writeHeader() throws IOException {
    }

    @Override
    public void writeIndex(StreamName name, OrcProto.RowIndex.Builder index,
        CompressionCodec codec) throws IOException {
      // TODO: right now we treat each slice as a stripe with a single RG and never bother
      //       with indexes. In phase 4, we need to add indexing and filtering.
    }
    @Override
    public void writeBloomFilter(StreamName name,
        OrcProto.BloomFilterIndex.Builder bloom, CompressionCodec codec) throws IOException {
    }

    @Override
    public void finalizeStripe(OrcProto.StripeFooter.Builder footer,
        OrcProto.StripeInformation.Builder dirEntry) throws IOException {
      List<ColumnEncoding> allEnc = footer.getColumnsList();
      OrcProto.StripeInformation si = dirEntry.build();
      if (LlapIoImpl.LOG.isTraceEnabled()) {
        LlapIoImpl.LOG.trace(("Finalizing stripe " + footer.build() + " => " + si)
            .replace('\n', ' '));
      }
      if (doesSourceHaveIncludes) {
        currentStripe.encodings = new ArrayList<>(writerIncludes.length);
        for (int i = 0; i < writerIncludes.length; ++i) {
          currentStripe.encodings.add(null);
        }
        currentStripe.encodings.set(0, allEnc.get(0));
        for (int i = 1; i < allEnc.size(); ++i) {
          int colIx = getSparseOrcIndexFromDenseDest(i);
          // LlapIoImpl.LOG.info("Setting enc " + i + "; " + colIx + " to " + allEnc.get(i));
          currentStripe.encodings.set(colIx, allEnc.get(i));
        }
      } else {
        currentStripe.encodings = new ArrayList<>(allEnc);
        for (int i = 0; i < currentStripe.encodings.size(); ++i) {
          // Don't record encodings for unneeded columns.
          if (writerIncludes[i]) continue;
          currentStripe.encodings.set(i, null);
        }
      }
      currentStripe.rowCount = si.getNumberOfRows();
      // ORC writer reuses streams, so we need to clean them here and extract data.
      for (Map.Entry<Integer, List<CacheOutputReceiver>> e : colStreams.entrySet()) {
        int colIx = e.getKey();
        List<CacheOutputReceiver> streams = e.getValue();
        List<CacheStreamData> data = new ArrayList<>(streams.size());
        for (CacheOutputReceiver receiver : streams) {
          List<MemoryBuffer> buffers = receiver.buffers;
          if (buffers == null) {
            // This can happen e.g. for a data stream when all the values are null.
            LlapIoImpl.LOG.debug("Buffers are null for " + receiver.name);
          }
          data.add(new CacheStreamData(receiver.suppressed, receiver.name,
              buffers == null ? new ArrayList<MemoryBuffer>() : new ArrayList<>(buffers)));
          receiver.clear();
        }
        if (doesSourceHaveIncludes) {
          int newColIx = getSparseOrcIndexFromDenseDest(colIx);
          if (LlapIoImpl.LOG.isTraceEnabled()) {
            LlapIoImpl.LOG.trace("Mapping the ORC writer column " + colIx + " to " + newColIx);
          }
          colIx = newColIx;
        }
        currentStripe.colStreams.put(colIx, data);
      }
      startStripe();
    }

    private int getSparseOrcIndexFromDenseDest(int denseColIx) {
      // denseColIx is index in ORC writer with includes. We -1 to skip the root column; get the
      // original text file index; then add the root column again. This makes many assumptions.
      // Also this only works for primitive types; vectordeserializer only supports these anyway.
      // The mapping for complex types with sub-cols in ORC would be much more difficult to build.
      return columnIds.get(denseColIx - 1) + 1;
    }
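    // Worked example (added; hypothetical table with text columns a,b,c,d and
    // columnIds = [1, 3], i.e. only b and d are read):
    //   dense writer column 1 (first after root) -> columnIds.get(0) + 1 = 2 (b's ORC index)
    //   dense writer column 2                    -> columnIds.get(1) + 1 = 4 (d's ORC index)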
    private boolean isNeeded(StreamName name) {
      return doesSourceHaveIncludes || writerIncludes[name.getColumn()];
    }

    @Override
    public void flush() throws IOException {
    }

    @Override
    public void appendRawStripe(ByteBuffer stripe,
        OrcProto.StripeInformation.Builder dirEntry) throws IOException {
      throw new UnsupportedOperationException(); // Only used in ACID writer.
    }

    public void setCurrentStripeOffsets(long currentKnownTornStart,
        long firstStartOffset, long lastStartOffset, long currentFileOffset) {
      currentStripe.knownTornStart = currentKnownTornStart;
      currentStripe.firstRowStart = firstStartOffset;
      currentStripe.lastRowStart = lastStartOffset;
      currentStripe.lastRowEnd = currentFileOffset;
    }
  }

  private interface CacheOutput {
    List<MemoryBuffer> getData();
    StreamName getName();
  }

  private static final class CacheOutputReceiver implements CacheOutput, OutputReceiver {
    private final BufferUsageManager bufferManager;
    private final StreamName name;
    private List<MemoryBuffer> buffers = null;
    private int lastBufferPos = -1;
    private boolean suppressed = false;

    public CacheOutputReceiver(BufferUsageManager bufferManager, StreamName name) {
      this.bufferManager = bufferManager;
      this.name = name;
    }

    public void clear() {
      buffers = null;
      lastBufferPos = -1;
      suppressed = false;
    }

    @Override
    public void suppress() {
      suppressed = true;
      lastBufferPos = -1;
    }

    @Override
    public void output(ByteBuffer buffer) throws IOException {
      // TODO: avoid put() by working directly in OutStream?
      if (LlapIoImpl.LOG.isTraceEnabled()) {
        LlapIoImpl.LOG.trace(name + " receiving a buffer of size " + buffer.remaining());
      }
      int size = buffer.remaining();
      ByteBuffer bb = null;
      if (buffers == null) {
        buffers = new ArrayList<>();
      }
      if (!buffers.isEmpty()) {
        MemoryBuffer lastBuffer = buffers.get(buffers.size() - 1);
        bb = lastBuffer.getByteBufferRaw();
        int written = lastBufferPos - bb.position();
        if (bb.remaining() - written < size) {
          lastBufferPos = -1;
          bb = null;
        }
      }
      boolean isNewBuffer = (lastBufferPos == -1);
      if (isNewBuffer) {
        MemoryBuffer[] dest = new MemoryBuffer[1];
        bufferManager.getAllocator().allocateMultiple(dest, size);
        LlapDataBuffer newBuffer = (LlapDataBuffer)dest[0];
        bb = newBuffer.getByteBufferRaw();
        lastBufferPos = bb.position();
        buffers.add(newBuffer);
      }
      // Since there's no close() here, maintain the initial read position between writes.
      int pos = bb.position();
      bb.position(lastBufferPos);
      bb.put(buffer);
      lastBufferPos = bb.position();
      bb.position(pos);
    }

    @Override
    public List<MemoryBuffer> getData() {
      return buffers;
    }

    @Override
    public StreamName getName() {
      return name;
    }
  }
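  // Note (added): output() above appends each incoming ByteBuffer to the tail of the last
  // allocated cache buffer while it still has room (tracked via lastBufferPos rather than
  // the ByteBuffer's own position), and only allocates a new buffer on overflow, so a
  // stream's data ends up in a minimal chain of allocator buffers.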
  private static class NullOutputReceiver implements OutputReceiver {
    @SuppressWarnings("unused")
    private final StreamName name;

    public NullOutputReceiver(StreamName name) {
      this.name = name;
    }

    @Override
    public void output(ByteBuffer buffer) throws IOException {
    }

    @Override
    public void suppress() {
    }
  }

  protected Void performDataRead() throws IOException {
    boolean isOk = false;
    try {
      try {
        long startTime = counters.startTimeCounter();
        LlapIoImpl.LOG.info("Processing data for {}", split.getPath());
        if (processStop()) {
          recordReaderTime(startTime);
          return null;
        }
        Boolean isFromCache = null;
        try {
          isFromCache = readFileWithCache(startTime);
        } finally {
          // Note that the code removes the data from the field as it's passed to the consumer,
          // so we expect to have stuff remaining in there only in case of errors.
          if (cachedData != null && cachedData.getData() != null) {
            for (StripeData sd : cachedData.getData()) {
              unlockAllBuffers(sd);
            }
            cachedData = null;
          }
        }
        if (isFromCache == null) return null; // Stop requested, and handled inside.
        if (!isFromCache) {
          if (!processOneFileSplit(split, startTime, Ref.from(0), null)) return null;
        }

        // Done with all the things.
        recordReaderTime(startTime);
        if (LlapIoImpl.LOG.isTraceEnabled()) {
          LlapIoImpl.LOG.trace("done processing {}", split);
        }
      } catch (Throwable e) {
        LlapIoImpl.LOG.error("Exception while processing", e);
        consumer.setError(e);
        throw e;
      }
      consumer.setDone();
      isOk = true;
      return null;
    } finally {
      cleanup(!isOk);
      // Do not clean up the writers - the callback should do it.
    }
  }

  private void unlockAllBuffers(StripeData si) {
    for (int i = 0; i < si.getData().length; ++i) {
      LlapDataBuffer[][] colData = si.getData()[i];
      if (colData == null) continue;
      for (int j = 0; j < colData.length; ++j) {
        LlapDataBuffer[] streamData = colData[j];
        if (streamData == null) continue;
        for (int k = 0; k < streamData.length; ++k) {
          bufferManager.decRefBuffer(streamData[k]);
        }
      }
    }
  }

  public void cacheFileData(StripeData sd) {
    if (sd == null || sd.getEncodings() == null) return;
    if (fileKey != null) {
      // Note that we cache each slice separately. We could cache them together at the end, but
      // then we won't be able to pass them to users without inc-refing explicitly.
      ColumnEncoding[] encodings = sd.getEncodings();
      for (int i = 0; i < encodings.length; ++i) {
        // Make data consistent with encodings, don't store useless information.
        if (sd.getData()[i] == null) {
          encodings[i] = null;
        } else if (encodings[i] == null) {
          throw new AssertionError("Caching data without an encoding at " + i + ": " + sd);
        }
      }
      FileData fd = new FileData(fileKey, encodings.length);
      fd.addStripe(sd);
      cache.putFileData(fd, Priority.NORMAL, counters);
    } else {
      lockAllBuffers(sd);
    }
    // We assume that if put/lock throws in the middle, it's ok to treat buffers as not being
    // locked and to blindly deallocate them, since they are not going to be used. Therefore
    // we don't remove them from the cleanup list - we will do it after sending to consumer.
    // This relies on sequence of calls to cacheFileData and sendEcb..
  }

  private void lockAllBuffers(StripeData sd) {
    for (int i = 0; i < sd.getData().length; ++i) {
      LlapDataBuffer[][] colData = sd.getData()[i];
      if (colData == null) continue;
      for (int j = 0; j < colData.length; ++j) {
        LlapDataBuffer[] streamData = colData[j];
        if (streamData == null) continue;
        for (int k = 0; k < streamData.length; ++k) {
          boolean canLock = bufferManager.incRefBuffer(streamData[k]);
          assert canLock;
        }
      }
    }
  }
  public Boolean readFileWithCache(long startTime) throws IOException {
    if (fileKey == null) return false;
    BooleanRef gotAllData = new BooleanRef();
    long endOfSplit = split.getStart() + split.getLength();
    this.cachedData = cache.getFileData(fileKey, split.getStart(),
        endOfSplit, writerIncludes, CC_FACTORY, counters, gotAllData);
    if (cachedData == null) {
      if (LlapIoImpl.CACHE_LOGGER.isTraceEnabled()) {
        LlapIoImpl.CACHE_LOGGER.trace("No data for the split found in cache");
      }
      return false;
    }
    String[] hosts = extractHosts(split, false), inMemoryHosts = extractHosts(split, true);
    List<StripeData> slices = cachedData.getData();
    if (slices.isEmpty()) return false;
    long uncachedPrefixEnd = slices.get(0).getKnownTornStart(),
        uncachedSuffixStart = slices.get(slices.size() - 1).getLastEnd();
    Ref<Integer> stripeIx = Ref.from(0);
    if (uncachedPrefixEnd > split.getStart()) {
      // TODO: can we merge neighboring splits? So we don't init so many readers.
      FileSplit sliceSplit = new FileSplit(split.getPath(), split.getStart(),
          uncachedPrefixEnd - split.getStart(), hosts, inMemoryHosts);
      if (!processOneFileSplit(sliceSplit, startTime, stripeIx, null)) return null;
    }
    while (!slices.isEmpty()) {
      StripeData slice = slices.get(0);
      long start = slice.getKnownTornStart();
      long len = slice.getLastStart() - start; // Will also read the last row.
      FileSplit sliceSplit = new FileSplit(split.getPath(), start, len, hosts, inMemoryHosts);
      if (!processOneFileSplit(sliceSplit, startTime, stripeIx, slice)) return null;
    }
    boolean isUnfortunate = false;
    if (uncachedSuffixStart == endOfSplit) {
      // This is rather obscure. The end of last row cached is precisely at the split end offset.
      // If the split is in the middle of the file, LRR would read one more row after that,
      // therefore as unfortunate as it is, we have to do a one-row read. However, for that to
      // have happened, someone should have supplied a split that ends inside the last row, i.e.
      // a few bytes earlier than the current split, which is pretty unlikely. What is more likely
      // is that the split, and the last row, both end at the end of file. Check for this.
      long size = split.getPath().getFileSystem(
          daemonConf).getFileStatus(split.getPath()).getLen();
      isUnfortunate = size > endOfSplit;
      if (isUnfortunate) {
        // Log at warn, given how unfortunate this is.
        LlapIoImpl.LOG.warn("One-row mismatch at the end of split " + split.getPath()
            + " at " + endOfSplit + "; file size is " + size);
      }
    }

    if (uncachedSuffixStart < endOfSplit || isUnfortunate) {
      // Note: we assume 0-length split is correct given how LRR interprets offsets (reading an
      // extra row). Should we instead assume 1+ chars and add 1 for isUnfortunate?
      FileSplit splitPart = new FileSplit(split.getPath(), uncachedSuffixStart,
          endOfSplit - uncachedSuffixStart, hosts, inMemoryHosts);
      if (!processOneFileSplit(splitPart, startTime, stripeIx, null)) return null;
    }
    return true;
  }
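  // Example (added): for a split covering [1000, 5000) where the cache holds slices
  // spanning [1500, 4200), readFileWithCache above issues an uncached-prefix read for
  // [1000, 1500), replays each cached slice in order, and finishes with an
  // uncached-suffix read for [4200, 5000).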
  public boolean processOneFileSplit(FileSplit split, long startTime,
      Ref<Integer> stripeIxRef, StripeData slice) throws IOException {
    LlapIoImpl.LOG.info("Processing one split {" + split.getPath() + ", "
        + split.getStart() + ", " + split.getLength() + "}");
    if (LlapIoImpl.CACHE_LOGGER.isTraceEnabled()) {
      LlapIoImpl.CACHE_LOGGER.trace("Cache data for the split is " + slice);
    }
    boolean[] splitIncludes = Arrays.copyOf(writerIncludes, writerIncludes.length);
    boolean hasAllData = slice != null
        && determineSplitIncludes(slice, splitIncludes, writerIncludes);

    // We have 3 cases here:
    // 1) All the data is in the cache. Always a single slice, no disk read, no cache puts.
    // 2) Some data is in the cache. Always a single slice, disk read and a single cache put.
    // 3) No data is in the cache. Multiple slices, disk read and multiple cache puts.
    if (hasAllData) {
      // Everything comes from cache.
      CacheWriter.CacheStripeData csd = null;
      boolean result = processOneSlice(csd, splitIncludes, stripeIxRef.value, slice, startTime);
      ++stripeIxRef.value;
      return result;
    }

    boolean result = false;
    // This initializes currentFileRead.
    startReadSplitFromFile(split, splitIncludes, slice);
    try {
      if (slice != null) {
        // If we had a cache range already, we expect a single matching disk slice.
        Vectors vectors = currentFileRead.readNextSlice();
        if (!vectors.isSupported()) {
          // Not in VRB mode - the new cache data is ready, we should use it.
          CacheWriter cacheWriter = currentFileRead.getCacheWriter();
          assert cacheWriter.stripes.size() == 1;
          result = processOneSlice(
              cacheWriter.stripes.get(0), splitIncludes, stripeIxRef.value, slice, startTime);
        } else {
          // VRB mode - process the VRBs with cache data; the new cache data is coming later.
          result = processOneSlice(
              vectors, splitIncludes, stripeIxRef.value, slice, startTime);
        }
        assert null == currentFileRead.readNextSlice();
        ++stripeIxRef.value;
      } else {
        // All the data comes from disk. The reader may have split it into multiple slices.
        Vectors vectors = currentFileRead.readNextSlice();
        assert vectors != null;
        result = true;
        if (!vectors.isSupported()) {
          // Not in VRB mode - the new cache data is (partially) ready, we should use it.
          while (currentFileRead.readNextSlice() != null); // Force the rest of the data thru.
          CacheWriter cacheWriter = currentFileRead.getCacheWriter();
          for (CacheWriter.CacheStripeData csd : cacheWriter.stripes) {
            if (!processOneSlice(csd, splitIncludes, stripeIxRef.value, null, startTime)) {
              result = false;
              break;
            }
            ++stripeIxRef.value;
          }
        } else {
          // VRB mode - process the VRBs with cache data; the new cache data is coming later.
          do {
            assert vectors.isSupported();
            if (!processOneSlice(vectors, splitIncludes, stripeIxRef.value, null, startTime)) {
              result = false;
              break;
            }
            ++stripeIxRef.value;
          } while ((vectors = currentFileRead.readNextSlice()) != null);
        }
      }
    } finally {
      cleanUpCurrentRead();
    }
    return result;
  }

  private static boolean determineSplitIncludes(
      StripeData slice, boolean[] splitIncludes, boolean[] writerIncludes) {
    ColumnEncoding[] cacheEncodings = slice.getEncodings();
    assert cacheEncodings != null;
    boolean hasAllData = true;
    for (int colIx = 0; colIx < cacheEncodings.length; ++colIx) {
      if (!splitIncludes[colIx]) continue;
      if ((cacheEncodings[colIx] != null) != (slice.getData()[colIx] != null)) {
        throw new AssertionError("Inconsistent cache slice " + slice);
      }
      if (cacheEncodings[colIx] != null) {
        splitIncludes[colIx] = false;
      } else {
        hasAllData = false;
      }
    }
    if (LlapIoImpl.CACHE_LOGGER.isTraceEnabled()) {
      LlapIoImpl.LOG.trace("Includes accounting for cached data: before " + DebugUtils.toString(
          writerIncludes) + ", after " + DebugUtils.toString(splitIncludes));
    }
    return hasAllData;
  }
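  // Example (added): with writerIncludes = [T, T, T, T] and a cached slice that has
  // encodings (and data) only for columns 1 and 3, determineSplitIncludes above flips
  // splitIncludes to [T, F, T, F] - columns 1 and 3 come from cache, columns 0 and 2
  // must still be read from disk - and returns hasAllData = false.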
  private boolean processOneSlice(CacheWriter.CacheStripeData diskData, boolean[] splitIncludes,
      int stripeIx, StripeData cacheData, long startTime) throws IOException {
    logProcessOneSlice(stripeIx, diskData, cacheData);

    ColumnEncoding[] cacheEncodings = cacheData == null ? null : cacheData.getEncodings();
    LlapDataBuffer[][][] cacheBuffers = cacheData == null ? null : cacheData.getData();
    long cacheRowCount = cacheData == null ? -1L : cacheData.getRowCount();
    SerDeStripeMetadata metadata = new SerDeStripeMetadata(stripeIx);
    StripeData sliceToCache = null;
    boolean hasAllData = diskData == null;
    if (!hasAllData) {
      sliceToCache = createSliceToCache(diskData, cacheData);
      metadata.setEncodings(combineCacheAndWriterEncodings(cacheEncodings, diskData.encodings));
      metadata.setRowCount(diskData.rowCount);
    } else {
      metadata.setEncodings(Lists.newArrayList(cacheEncodings));
      metadata.setRowCount(cacheRowCount);
    }
    if (LlapIoImpl.LOG.isTraceEnabled()) {
      LlapIoImpl.LOG.trace("Derived stripe metadata for this split is " + metadata);
    }
    consumer.setStripeMetadata(metadata);

    OrcEncodedColumnBatch ecb = ECB_POOL.take();
    ecb.init(fileKey, metadata.getStripeIx(), OrcEncodedColumnBatch.ALL_RGS,
        writerIncludes.length);
    for (int colIx = 0; colIx < writerIncludes.length; ++colIx) {
      if (!writerIncludes[colIx]) continue;
      ecb.initColumn(colIx, OrcEncodedColumnBatch.MAX_DATA_STREAMS);
      if (!hasAllData && splitIncludes[colIx]) {
        // The column has been read from disk.
        List<CacheWriter.CacheStreamData> streams = diskData.colStreams.get(colIx);
        LlapDataBuffer[][] newCacheDataForCol = createArrayToCache(sliceToCache, colIx, streams);
        if (streams == null) continue; // Struct column, such as root?
        Iterator<CacheWriter.CacheStreamData> iter = streams.iterator();
        while (iter.hasNext()) {
          CacheWriter.CacheStreamData stream = iter.next();
          if (stream.isSuppressed) {
            if (LlapIoImpl.LOG.isTraceEnabled()) {
              LlapIoImpl.LOG.trace("Removing a suppressed stream " + stream.name);
            }
            iter.remove();
            discardUncachedBuffers(stream.data);
            continue;
          }
          int streamIx = setStreamDataToCache(newCacheDataForCol, stream);
          ColumnStreamData cb = CSD_POOL.take();
          cb.incRef();
          cb.setCacheBuffers(stream.data);
          ecb.setStreamData(colIx, streamIx, cb);
        }
      } else {
        processColumnCacheData(cacheBuffers, ecb, colIx);
      }
    }
    if (processStop()) {
      recordReaderTime(startTime);
      return false;
    }
    // Note: we cache slices one by one since we need to lock them before sending to consumer.
    // We could lock here, then cache them together, then unlock here and in return,
    // but for now just rely on the cache put to lock them before we send them over.
    if (LlapIoImpl.CACHE_LOGGER.isTraceEnabled()) {
      LlapIoImpl.CACHE_LOGGER.trace("Data to cache from the read " + sliceToCache);
    }
    cacheFileData(sliceToCache);
    return sendEcbToConsumer(ecb, cacheData != null, diskData);
  }

  private void validateCacheAndDisk(StripeData cacheData,
      long rowCount, long encodingCount, Object diskDataLog) throws IOException {
    if (rowCount != cacheData.getRowCount()) {
      throw new IOException("Row count mismatch; disk " + rowCount + ", cache "
          + cacheData.getRowCount() + " from " + diskDataLog + " and " + cacheData);
    }
    if (encodingCount > 0 && encodingCount != cacheData.getEncodings().length) {
      throw new IOException("Column count mismatch; disk " + encodingCount + ", cache "
          + cacheData.getEncodings().length + " from " + diskDataLog + " and " + cacheData);
    }
  }
  /** Unlike the other overload of processOneSlice, doesn't cache data. */
  private boolean processOneSlice(Vectors diskData, boolean[] splitIncludes,
      int stripeIx, StripeData cacheData, long startTime) throws IOException {
    if (diskData == null) {
      throw new AssertionError(); // The other overload should have been used.
    }
    // LlapIoImpl.LOG.debug("diskData " + diskData);
    logProcessOneSlice(stripeIx, diskData, cacheData);

    if (cacheData == null && diskData.getRowCount() == 0) {
      return true; // Nothing to process.
    }
    ColumnEncoding[] cacheEncodings = cacheData == null ? null : cacheData.getEncodings();
    LlapDataBuffer[][][] cacheBuffers = cacheData == null ? null : cacheData.getData();
    if (cacheData != null) {
      // Don't validate column count - no encodings for vectors.
      validateCacheAndDisk(cacheData, diskData.getRowCount(), -1, diskData);
    }

    SerDeStripeMetadata metadata = new SerDeStripeMetadata(stripeIx);
    metadata.setEncodings(Arrays.asList(cacheEncodings == null
        ? new ColumnEncoding[splitIncludes.length] : cacheEncodings));
    metadata.setRowCount(diskData.getRowCount());
    if (LlapIoImpl.LOG.isTraceEnabled()) {
      LlapIoImpl.LOG.trace("Derived stripe metadata for this split is " + metadata);
    }
    consumer.setStripeMetadata(metadata);

    OrcEncodedColumnBatch ecb = ECB_POOL.take();
    ecb.init(fileKey, metadata.getStripeIx(), OrcEncodedColumnBatch.ALL_RGS,
        writerIncludes.length);
    int vectorsIx = 0;
    for (int colIx = 0; colIx < writerIncludes.length; ++colIx) {
      if (!writerIncludes[colIx]) continue;
      if (splitIncludes[colIx]) {
        // Skip the 0-th column, since it won't have a vector after reading the text source.
        if (colIx != 0) {
          List<ColumnVector> vectors = diskData.getVectors(vectorsIx++);
          if (LlapIoImpl.LOG.isTraceEnabled()) {
            LlapIoImpl.LOG.trace("Processing vectors for column " + colIx + ": " + vectors);
          }
          ecb.initColumnWithVectors(colIx, vectors);
        } else {
          ecb.initColumn(0, OrcEncodedColumnBatch.MAX_DATA_STREAMS);
        }
      } else {
        ecb.initColumn(colIx, OrcEncodedColumnBatch.MAX_DATA_STREAMS);
        processColumnCacheData(cacheBuffers, ecb, colIx);
      }
    }
    if (processStop()) {
      recordReaderTime(startTime);
      return false;
    }
    return sendEcbToConsumer(ecb, cacheData != null, null);
  }
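  // Note (added): the async path below mirrors the caching half of the non-vector
  // processOneSlice overload; by the time it runs, the rows have already been sent to
  // the consumer as vectors, so the slice is cached and then immediately unlocked here
  // instead of being handed off.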
  private void processAsyncCacheData(CacheWriter.CacheStripeData diskData,
      boolean[] splitIncludes) throws IOException {
    StripeData sliceToCache = new StripeData(diskData.knownTornStart, diskData.firstRowStart,
        diskData.lastRowStart, diskData.lastRowEnd, diskData.rowCount,
        diskData.encodings.toArray(new ColumnEncoding[diskData.encodings.size()]));
    for (int colIx = 0; colIx < splitIncludes.length; ++colIx) {
      if (!splitIncludes[colIx]) continue;
      // The column has been read from disk.
      List<CacheWriter.CacheStreamData> streams = diskData.colStreams.get(colIx);
      LlapDataBuffer[][] newCacheDataForCol = createArrayToCache(sliceToCache, colIx, streams);
      if (streams == null) continue; // Struct column, such as root?
      Iterator<CacheWriter.CacheStreamData> iter = streams.iterator();
      while (iter.hasNext()) {
        CacheWriter.CacheStreamData stream = iter.next();
        if (stream.isSuppressed) {
          if (LlapIoImpl.LOG.isTraceEnabled()) {
            LlapIoImpl.LOG.trace("Removing a suppressed stream " + stream.name);
          }
          iter.remove();
          discardUncachedBuffers(stream.data);
          continue;
        }
        setStreamDataToCache(newCacheDataForCol, stream);
      }
    }
    if (LlapIoImpl.CACHE_LOGGER.isTraceEnabled()) {
      LlapIoImpl.CACHE_LOGGER.trace("Data to cache from async read " + sliceToCache);
    }
    try {
      cacheFileData(sliceToCache);
    } finally {
      unlockAllBuffers(sliceToCache);
    }
  }

  private StripeData createSliceToCache(
      CacheWriter.CacheStripeData diskData, StripeData cacheData) throws IOException {
    assert diskData != null;
    if (cacheData == null) {
      return new StripeData(diskData.knownTornStart, diskData.firstRowStart,
          diskData.lastRowStart, diskData.lastRowEnd, diskData.rowCount,
          diskData.encodings.toArray(new ColumnEncoding[diskData.encodings.size()]));
    } else {
      long rowCount = diskData.rowCount, encodingCount = diskData.encodings.size();
      validateCacheAndDisk(cacheData, rowCount, encodingCount, diskData);
      if (LlapIoImpl.LOG.isDebugEnabled()) {
        LlapIoImpl.LOG.debug("Creating slice to cache in addition to an existing slice "
            + cacheData.toCoordinateString() + "; disk offsets were "
            + diskData.toCoordinateString());
      }
      // Note: we could just do what we already do above from disk data, except for the validation
      // that is not strictly necessary, and knownTornStart which is an optimization.
      StripeData sliceToCache = StripeData.duplicateStructure(cacheData);
      for (int i = 0; i < diskData.encodings.size(); ++i) {
        sliceToCache.getEncodings()[i] = diskData.encodings.get(i);
      }
      sliceToCache.setKnownTornStart(Math.min(
          diskData.knownTornStart, sliceToCache.getKnownTornStart()));
      return sliceToCache;
    }
  }

  private static LlapDataBuffer[][] createArrayToCache(
      StripeData sliceToCache, int colIx, List<CacheWriter.CacheStreamData> streams) {
    if (LlapIoImpl.LOG.isTraceEnabled()) {
      LlapIoImpl.LOG.trace("Processing streams for column " + colIx + ": " + streams);
    }
    LlapDataBuffer[][] newCacheDataForCol = sliceToCache.getData()[colIx]
        = new LlapDataBuffer[OrcEncodedColumnBatch.MAX_DATA_STREAMS][];
    return newCacheDataForCol;
  }

  private static int setStreamDataToCache(
      LlapDataBuffer[][] newCacheDataForCol, CacheWriter.CacheStreamData stream) {
    int streamIx = stream.name.getKind().getNumber();
    // This is kinda hacky - we "know" these are LlapDataBuffer-s.
    newCacheDataForCol[streamIx] = stream.data.toArray(new LlapDataBuffer[stream.data.size()]);
    return streamIx;
  }

  private void processColumnCacheData(LlapDataBuffer[][][] cacheBuffers,
      OrcEncodedColumnBatch ecb, int colIx) {
    // The column has been obtained from cache.
    LlapDataBuffer[][] colData = cacheBuffers[colIx];
    if (LlapIoImpl.CACHE_LOGGER.isTraceEnabled()) {
      LlapIoImpl.CACHE_LOGGER.trace("Processing cache data for column " + colIx + ": "
          + SerDeLowLevelCacheImpl.toString(colData));
    }
    for (int streamIx = 0; streamIx < colData.length; ++streamIx) {
      if (colData[streamIx] == null) continue;
      ColumnStreamData cb = CSD_POOL.take();
      cb.incRef();
      cb.setCacheBuffers(Lists.<MemoryBuffer>newArrayList(colData[streamIx]));
      ecb.setStreamData(colIx, streamIx, cb);
    }
  }
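  // Note (added): in the per-column cache arrays above, the stream slot is simply the
  // OrcProto.Stream.Kind ordinal (see setStreamDataToCache), so e.g. a column's DATA
  // and LENGTH streams always land in fixed positions of the
  // LlapDataBuffer[MAX_DATA_STREAMS][] array and need no extra lookup table.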
"null" : cacheData.toCoordinateString(); if (LlapIoImpl.LOG.isDebugEnabled()) { LlapIoImpl.LOG.debug("Processing slice #" + stripeIx + " " + sliceStr + "; has" + ((cacheData == null) ? " no" : "") + " cache data; has" + ((diskData == null) ? " no" : "") + " disk data"); } } private void discardUncachedBuffers(List<MemoryBuffer> list) { for (MemoryBuffer buffer : list) { boolean isInvalidated = ((LlapDataBuffer)buffer).invalidate(); assert isInvalidated; bufferManager.getAllocator().deallocate(buffer); } } private static List<ColumnEncoding> combineCacheAndWriterEncodings( ColumnEncoding[] cacheEncodings, List<ColumnEncoding> writerEncodings) throws IOException { // TODO: refactor with cache impl? it has the same merge logic if (cacheEncodings == null) { return new ArrayList<>(writerEncodings); } if (cacheEncodings.length != writerEncodings.size()) { throw new IOException("Incompatible encoding lengths: " + Arrays.toString(cacheEncodings) + " vs " + writerEncodings); } ColumnEncoding[] combinedEncodings = Arrays.copyOf(cacheEncodings, cacheEncodings.length); for (int colIx = 0; colIx < cacheEncodings.length; ++colIx) { ColumnEncoding newEncoding = writerEncodings.get(colIx); if (newEncoding == null) continue; if (combinedEncodings[colIx] != null && !newEncoding.equals(combinedEncodings[colIx])) { throw new IOException("Incompatible encodings at " + colIx + ": " + Arrays.toString(cacheEncodings) + " vs " + writerEncodings); } combinedEncodings[colIx] = newEncoding; } return Lists.newArrayList(combinedEncodings); } private static class Vectors { private final List<ColumnVector>[] data; private final boolean isSupported; private final long rowCount; @SuppressWarnings("unchecked") public Vectors(List<VectorizedRowBatch> vrbs) { if (vrbs == null) { isSupported = false; data = null; rowCount = 0; return; } isSupported = true; if (vrbs.isEmpty()) { data = null; rowCount = 0; return; } data = new List[vrbs.get(0).numCols]; for (int i = 0; i < data.length; ++i) { data[i] = new ArrayList<>(vrbs.size()); } int rowCount = 0; for (VectorizedRowBatch vrb : vrbs) { assert !vrb.selectedInUse; rowCount += vrb.size; for (int i = 0; i < vrb.cols.length; ++i) { data[i].add(vrb.cols[i]); } } this.rowCount = rowCount; } public List<ColumnVector> getVectors(int ix) { return data[ix]; } public long getRowCount() { return rowCount; } public boolean isSupported() { return isSupported; } @Override public String toString() { return "Vectors {isSupported=" + isSupported + ", rowCount=" + rowCount + ", data=" + Arrays.toString(data) + "}"; } } /** * This class only exists because Java doesn't have yield return. The original method * before this change only needed yield return-s sprinkled here and there; however, * Java developers are usually paid by class, so here we go. 
  /**
   * This class only exists because Java doesn't have yield return. The original method
   * before this change only needed yield return-s sprinkled here and there; however,
   * Java developers are usually paid by class, so here we go.
   */
  private static class FileReaderYieldReturn {
    private ReaderWithOffsets offsetReader;
    private int rowsPerSlice = 0;
    private long currentKnownTornStart;
    private long lastStartOffset = Long.MIN_VALUE, firstStartOffset = Long.MIN_VALUE;
    private boolean hasUnsplittableData = false;
    private final EncodingWriter writer;
    private final boolean maySplitTheSplit;
    private final int targetSliceRowCount;
    private final FileSplit split;

    public FileReaderYieldReturn(ReaderWithOffsets offsetReader, FileSplit split,
        EncodingWriter writer, boolean maySplitTheSplit, int targetSliceRowCount) {
      this.offsetReader = offsetReader;
      currentKnownTornStart = split.getStart();
      this.writer = writer;
      this.maySplitTheSplit = maySplitTheSplit;
      this.targetSliceRowCount = targetSliceRowCount;
      this.split = split;
    }

    public CacheWriter getCacheWriter() throws IOException {
      return writer.getCacheWriter();
    }

    public Vectors readNextSlice() throws IOException {
      if (offsetReader == null) return null;
      try {
        while (offsetReader.next()) {
          hasUnsplittableData = true;
          Writable value = offsetReader.getCurrentRow();
          lastStartOffset = offsetReader.getCurrentRowStartOffset();
          if (firstStartOffset == Long.MIN_VALUE) {
            firstStartOffset = lastStartOffset;
          }
          writer.writeOneRow(value);

          if (maySplitTheSplit && ++rowsPerSlice == targetSliceRowCount) {
            assert offsetReader.hasOffsets();
            writer.flushIntermediateData();
            long fileOffset = offsetReader.getCurrentRowEndOffset();
            // Must support offsets to be able to split.
            if (firstStartOffset < 0 || lastStartOffset < 0 || fileOffset < 0) {
              throw new AssertionError("Unable to get offsets from "
                  + offsetReader.getClass().getSimpleName());
            }
            writer.setCurrentStripeOffsets(
                currentKnownTornStart, firstStartOffset, lastStartOffset, fileOffset);
            writer.writeIntermediateFooter();

            // Split starting at row start will not read that row.
            currentKnownTornStart = lastStartOffset;
            // Row offsets will be determined from the reader (we could set the first from last).
            lastStartOffset = Long.MIN_VALUE;
            firstStartOffset = Long.MIN_VALUE;
            rowsPerSlice = 0;
            return new Vectors(writer.extractCurrentVrbs());
          }
        }
        try {
          Vectors result = null;
          if (rowsPerSlice > 0 || (!maySplitTheSplit && hasUnsplittableData)) {
            long fileOffset = -1;
            if (!offsetReader.hasOffsets()) {
              // The reader doesn't support offsets. We adjust offsets to match future splits.
              // If cached split was starting at row start, that row would be skipped, so +1
              firstStartOffset = split.getStart() + 1;
              // Last row starting at the end of the split would be read.
              lastStartOffset = split.getStart() + split.getLength();
              // However, it must end after the split end, otherwise the next one would have
              // been read.
              fileOffset = lastStartOffset + 1;
              if (LlapIoImpl.CACHE_LOGGER.isDebugEnabled()) {
                LlapIoImpl.CACHE_LOGGER.debug("Cache offsets based on the split - 'first row' at "
                    + firstStartOffset + "; 'last row' at " + lastStartOffset + ", " + fileOffset);
              }
            } else {
              fileOffset = offsetReader.getCurrentRowEndOffset();
              assert firstStartOffset >= 0 && lastStartOffset >= 0 && fileOffset >= 0;
            }
            writer.setCurrentStripeOffsets(
                currentKnownTornStart, firstStartOffset, lastStartOffset, fileOffset);
            // Close the writer to finalize the metadata.
            writer.close();
            result = new Vectors(writer.extractCurrentVrbs());
          } else {
            writer.close();
          }
          return result;
        } finally {
          closeOffsetReader();
        }
      } catch (Exception ex) {
        closeOffsetReader();
        throw (ex instanceof IOException) ? (IOException)ex : new IOException(ex);
      }
    }

    private void closeOffsetReader() {
      if (offsetReader == null) return;
      try {
        offsetReader.close();
      } catch (Exception ex) {
        LlapIoImpl.LOG.error("Failed to close source reader", ex);
      }
      offsetReader = null;
    }
  }
  public void startReadSplitFromFile(
      FileSplit split, boolean[] splitIncludes, StripeData slice) throws IOException {
    boolean maySplitTheSplit = slice == null;
    ReaderWithOffsets offsetReader = null;
    @SuppressWarnings("rawtypes")
    RecordReader sourceReader = sourceInputFormat.getRecordReader(split, jobConf, reporter);
    try {
      offsetReader = createOffsetReader(sourceReader);
      sourceReader = null;
    } finally {
      if (sourceReader != null) {
        try {
          sourceReader.close();
        } catch (Exception ex) {
          LlapIoImpl.LOG.error("Failed to close source reader", ex);
        }
      }
    }
    maySplitTheSplit = maySplitTheSplit && offsetReader.hasOffsets();

    try {
      StructObjectInspector originalOi = (StructObjectInspector)getOiFromSerDe();
      List<Integer> splitColumnIds = OrcInputFormat.genIncludedColumnsReverse(
          schema, splitIncludes, false);
      // fileread writes to the writer, which writes to orcWriter, which writes to cacheWriter
      EncodingWriter writer = VectorDeserializeOrcWriter.create(
          sourceInputFormat, sourceSerDe, parts, daemonConf, jobConf, split.getPath(),
          originalOi, splitColumnIds, splitIncludes, allocSize);
      // TODO: move this into ctor? EW would need to create CacheWriter then
      List<Integer> cwColIds = writer.isOnlyWritingIncludedColumns() ? splitColumnIds : columnIds;
      writer.init(new CacheWriter(bufferManager, cwColIds, splitIncludes,
          writer.isOnlyWritingIncludedColumns()), daemonConf, split.getPath());
      if (writer instanceof VectorDeserializeOrcWriter) {
        VectorDeserializeOrcWriter asyncWriter = (VectorDeserializeOrcWriter)writer;
        asyncWriter.startAsync(new AsyncCacheDataCallback());
        this.asyncWriters.add(asyncWriter);
      }
      currentFileRead = new FileReaderYieldReturn(
          offsetReader, split, writer, maySplitTheSplit, targetSliceRowCount);
    } finally {
      // Assignment is the last thing in the try, so if it happens we assume success.
      if (currentFileRead != null) return;
      if (offsetReader == null) return;
      try {
        offsetReader.close();
      } catch (Exception ex) {
        LlapIoImpl.LOG.error("Failed to close source reader", ex);
      }
    }
  }

  private class AsyncCacheDataCallback implements AsyncCallback {
    @Override
    public void onComplete(VectorDeserializeOrcWriter writer) {
      CacheWriter cacheWriter = null;
      try {
        cacheWriter = writer.getCacheWriter();
        // What we were reading from disk originally.
        boolean[] cacheIncludes = writer.getOriginalCacheIncludes();
        Iterator<CacheWriter.CacheStripeData> iter = cacheWriter.stripes.iterator();
        while (iter.hasNext()) {
          processAsyncCacheData(iter.next(), cacheIncludes);
          iter.remove();
        }
      } catch (IOException e) {
        LlapIoImpl.LOG.error("Failed to cache async data", e);
      } finally {
        // Guard against getCacheWriter() having thrown before the assignment.
        if (cacheWriter != null) {
          cacheWriter.discardData();
        }
      }
    }
  }

  // TODO: this interface is ugly. The two implementations are so far apart feature-wise
  //       after all the perf changes that we might as well hardcode them separately.
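  // Note (added): of the two implementations, DeserializerOrcWriter (below) deserializes
  // and writes one row at a time, while VectorDeserializeOrcWriter (in a separate file)
  // batches rows into VectorizedRowBatch-es and can also run asynchronously; see
  // startReadSplitFromFile above for where the async callback is wired up.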
  static abstract class EncodingWriter {
    protected Writer orcWriter;
    protected CacheWriter cacheWriter;
    protected final StructObjectInspector sourceOi;
    private final int allocSize;

    public EncodingWriter(StructObjectInspector sourceOi, int allocSize) {
      this.sourceOi = sourceOi;
      this.allocSize = allocSize;
    }

    public void init(CacheWriter cacheWriter, Configuration conf, Path path) throws IOException {
      this.orcWriter = createOrcWriter(cacheWriter, conf, path, sourceOi);
      this.cacheWriter = cacheWriter;
    }

    public CacheWriter getCacheWriter() {
      return cacheWriter;
    }

    public abstract boolean isOnlyWritingIncludedColumns();

    public abstract void writeOneRow(Writable row) throws IOException;
    public abstract void setCurrentStripeOffsets(long currentKnownTornStart,
        long firstStartOffset, long lastStartOffset, long fileOffset);
    public abstract void flushIntermediateData() throws IOException;
    public abstract void writeIntermediateFooter() throws IOException;
    public abstract List<VectorizedRowBatch> extractCurrentVrbs();

    public void close() throws IOException {
      if (orcWriter != null) {
        try {
          orcWriter.close();
          orcWriter = null;
        } catch (Exception ex) {
          LlapIoImpl.LOG.error("Failed to close ORC writer", ex);
        }
      }
      if (cacheWriter != null) {
        try {
          cacheWriter.discardData();
          cacheWriter = null;
        } catch (Exception ex) {
          LlapIoImpl.LOG.error("Failed to close cache writer", ex);
        }
      }
    }

    protected Writer createOrcWriter(CacheWriter cacheWriter, Configuration conf,
        Path path, StructObjectInspector oi) throws IOException {
      // TODO: this is currently broken. We need to set memory manager to a bogus implementation
      //       to avoid problems with memory manager actually tracking the usage.
      return OrcFile.createWriter(path, createOrcWriterOptions(oi, conf, cacheWriter, allocSize));
    }
  }

  static class DeserializerOrcWriter extends EncodingWriter {
    private final Deserializer sourceSerDe;

    public DeserializerOrcWriter(
        Deserializer sourceSerDe, StructObjectInspector sourceOi, int allocSize) {
      super(sourceOi, allocSize);
      this.sourceSerDe = sourceSerDe;
    }

    @Override
    public void close() throws IOException {
      orcWriter.close();
    }

    @Override
    public void writeOneRow(Writable value) throws IOException {
      Object row = null;
      try {
        row = sourceSerDe.deserialize(value);
      } catch (SerDeException e) {
        throw new IOException(e);
      }
      orcWriter.addRow(row);
    }

    @Override
    public void flushIntermediateData() {
      // No-op.
    }

    @Override
    public void writeIntermediateFooter() throws IOException {
      orcWriter.writeIntermediateFooter();
    }

    @Override
    public boolean isOnlyWritingIncludedColumns() {
      return false; // LazySimpleSerDe doesn't support projection.
    }

    @Override
    public void setCurrentStripeOffsets(long currentKnownTornStart,
        long firstStartOffset, long lastStartOffset, long fileOffset) {
      cacheWriter.setCurrentStripeOffsets(
          currentKnownTornStart, firstStartOffset, lastStartOffset, fileOffset);
    }

    @Override
    public List<VectorizedRowBatch> extractCurrentVrbs() {
      return null; // Doesn't support creating VRBs.
    }
  }
  private static final class NoopMemoryManager extends MemoryManager {
    public NoopMemoryManager() {
      super(null);
    }

    @Override
    public void addedRow(int rows) {}
    @Override
    public void addWriter(Path path, long requestedAllocation, Callback callback) {}
    @Override
    public void notifyWriters() {}
    @Override
    public void removeWriter(Path path) throws IOException {}
  }
  private static final NoopMemoryManager MEMORY_MANAGER = new NoopMemoryManager();

  static WriterOptions createOrcWriterOptions(ObjectInspector sourceOi,
      Configuration conf, CacheWriter cacheWriter, int allocSize) throws IOException {
    return OrcFile.writerOptions(conf).stripeSize(Long.MAX_VALUE).blockSize(Long.MAX_VALUE)
        .rowIndexStride(Integer.MAX_VALUE) // For now, do not limit this - one RG per split
        .blockPadding(false).compress(CompressionKind.NONE).version(Version.CURRENT)
        .encodingStrategy(EncodingStrategy.SPEED).bloomFilterColumns(null).inspector(sourceOi)
        .physicalWriter(cacheWriter).memory(MEMORY_MANAGER).bufferSize(allocSize);
  }

  private ObjectInspector getOiFromSerDe() throws IOException {
    try {
      return sourceSerDe.getObjectInspector();
    } catch (SerDeException e) {
      throw new IOException(e);
    }
  }

  private ReaderWithOffsets createOffsetReader(RecordReader<?, ?> sourceReader) {
    if (LlapIoImpl.LOG.isDebugEnabled()) {
      LlapIoImpl.LOG.debug("Using " + sourceReader.getClass().getSimpleName() + " to read data");
    }
    // Handle the special cases here. Perhaps we could have a more general structure, or even
    // a configurable set (like storage handlers), but for now we only have one.
    if (isLrrEnabled && sourceReader instanceof LineRecordReader) {
      return LineRrOffsetReader.create((LineRecordReader)sourceReader);
    }
    return new PassThruOffsetReader(sourceReader);
  }

  private static String[] extractHosts(FileSplit split, boolean isInMemory) throws IOException {
    SplitLocationInfo[] locInfo = split.getLocationInfo();
    if (locInfo == null) return new String[0];
    List<String> hosts = null; // TODO: most of the time, there's no in-memory. Use an array?
    for (int i = 0; i < locInfo.length; i++) {
      if (locInfo[i].isInMemory() != isInMemory) continue;
      if (hosts == null) {
        hosts = new ArrayList<>();
      }
      hosts.add(locInfo[i].getLocation());
    }
    if (hosts == null) return new String[0];
    return hosts.toArray(new String[hosts.size()]);
  }
  private boolean sendEcbToConsumer(OrcEncodedColumnBatch ecb,
      boolean hasCachedSlice, CacheWriter.CacheStripeData diskData) {
    if (ecb == null) { // This basically means stop has been called.
      cleanup(true);
      return false;
    }
    LlapIoImpl.LOG.trace("Sending a batch over to consumer");
    consumer.consumeData(ecb);
    if (hasCachedSlice) {
      cachedData.getData().remove(0); // See javadoc - no need to clean up the cache data anymore.
    }
    if (diskData != null) {
      diskData.colStreams.clear();
    }
    return true;
  }

  private void cleanup(boolean isError) {
    cleanUpCurrentRead();
    if (!isError) return;
    for (VectorDeserializeOrcWriter asyncWriter : asyncWriters) {
      try {
        asyncWriter.interrupt();
      } catch (Exception ex) {
        LlapIoImpl.LOG.warn("Failed to interrupt an async writer", ex);
      }
    }
    asyncWriters.clear();
  }

  private void cleanUpCurrentRead() {
    if (currentFileRead == null) return;
    try {
      currentFileRead.closeOffsetReader();
      currentFileRead = null;
    } catch (Exception ex) {
      LlapIoImpl.LOG.error("Failed to close current file reader", ex);
    }
  }

  private void recordReaderTime(long startTime) {
    counters.incrTimeCounter(LlapIOCounters.TOTAL_IO_TIME_NS, startTime);
  }

  private boolean processStop() {
    if (!isStopped) return false;
    LlapIoImpl.LOG.info("SerDe-based data reader is stopping");
    cleanup(true);
    return true;
  }

  private static Object determineFileId(FileSystem fs, FileSplit split,
      boolean allowSynthetic) throws IOException {
    /* TODO: support this optionally? this is not OrcSplit, but we could add a custom split.
    Object fileKey = ((OrcSplit)split).getFileKey();
    if (fileKey != null) return fileKey; */
    LlapIoImpl.LOG.warn("Split for " + split.getPath() + " (" + split.getClass()
        + ") does not have file ID");
    return HdfsUtils.getFileId(fs, split.getPath(), allowSynthetic);
  }

  @Override
  public void returnData(OrcEncodedColumnBatch ecb) {
    for (int colIx = 0; colIx < ecb.getTotalColCount(); ++colIx) {
      if (!ecb.hasData(colIx)) continue;
      // TODO: reuse columnvector-s on hasBatch - save the array by column? take apart each list.
      ColumnStreamData[] datas = ecb.getColumnData(colIx);
      for (ColumnStreamData data : datas) {
        if (data == null || data.decRef() != 0) continue;
        if (LlapIoImpl.LOCKING_LOGGER.isTraceEnabled()) {
          for (MemoryBuffer buf : data.getCacheBuffers()) {
            LlapIoImpl.LOCKING_LOGGER.trace("Unlocking {} at the end of processing", buf);
          }
        }
        bufferManager.decRefBuffers(data.getCacheBuffers());
        CSD_POOL.offer(data);
      }
    }
    // We can offer ECB even with some streams not discarded; reset() will clear the arrays.
    ECB_POOL.offer(ecb);
  }

  @Override
  public TezCounters getTezCounters() {
    return counters.getTezCounters();
  }
}