/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.io.orc;

import java.io.IOException;
import java.util.Arrays;
import java.util.BitSet;
import java.util.List;
import java.util.Map.Entry;
import java.util.TreeMap;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.ValidReadTxnList;
import org.apache.hadoop.hive.common.ValidTxnList;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx;
import org.apache.hadoop.hive.ql.io.AcidUtils;
import org.apache.hadoop.hive.ql.io.RecordIdentifier;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Reporter;
import org.apache.orc.OrcProto;
import org.apache.orc.OrcUtils;
import org.apache.orc.TypeDescription;
import org.apache.orc.impl.AcidStats;
import org.apache.orc.impl.OrcAcidUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.annotations.VisibleForTesting;

/**
 * A fast vectorized batch reader class for ACID when split-update behavior is enabled.
 * When split-update is turned on, row-by-row stitching can be avoided when creating the final
 * version of a row. Essentially, there are only insert and delete events. Insert events can be
 * read directly from the base files/insert_only deltas in vectorized row batches. The deleted
 * rows can then be easily indicated via the 'selected' field of the vectorized row batch.
 * Refer to HIVE-14233 for more details.
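 *
 * The batches handed back by the base reader use the ACID event schema (see the column index
 * constants in {@link OrcRecordUpdater}): operation, originalTransaction, bucket, rowId,
 * currentTransaction, plus a nested 'row' struct that carries the actual user columns. Only the
 * user columns are copied into the outgoing batch; the metadata columns are consumed here to
 * filter out invalid and deleted rows.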
 */
public class VectorizedOrcAcidRowBatchReader
    implements org.apache.hadoop.mapred.RecordReader<NullWritable, VectorizedRowBatch> {

  private static final Logger LOG = LoggerFactory.getLogger(VectorizedOrcAcidRowBatchReader.class);

  private org.apache.hadoop.hive.ql.io.orc.RecordReader baseReader;
  private VectorizedRowBatchCtx rbCtx;
  private VectorizedRowBatch vectorizedRowBatchBase;
  private long offset;
  private long length;
  private float progress = 0.0f;
  private Object[] partitionValues;
  private boolean addPartitionCols = true;
  private ValidTxnList validTxnList;
  private DeleteEventRegistry deleteEventRegistry;

  public VectorizedOrcAcidRowBatchReader(InputSplit inputSplit, JobConf conf,
      Reporter reporter) throws IOException {
    final boolean isAcidRead = HiveConf.getBoolVar(conf, ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN);
    final AcidUtils.AcidOperationalProperties acidOperationalProperties
        = AcidUtils.getAcidOperationalProperties(conf);

    // This type of VectorizedOrcAcidRowBatchReader can only be created when split-update is
    // enabled for an ACID case and the file format is ORC.
    boolean isReadNotAllowed = !isAcidRead || !acidOperationalProperties.isSplitUpdate()
        || !(inputSplit instanceof OrcSplit);
    if (isReadNotAllowed) {
      OrcInputFormat.raiseAcidTablesMustBeReadWithAcidReaderException(conf);
    }

    final OrcSplit orcSplit = (OrcSplit) inputSplit;
    rbCtx = Utilities.getVectorizedRowBatchCtx(conf);

    reporter.setStatus(orcSplit.toString());
    Reader reader = OrcInputFormat.createOrcReaderForSplit(conf, orcSplit);
    Reader.Options readerOptions = OrcInputFormat.createOptionsForReader(conf);
    readerOptions = OrcRawRecordMerger.createEventOptions(readerOptions);
    this.offset = orcSplit.getStart();
    this.length = orcSplit.getLength();

    // Careful with the range here: unlike the delete deltas below, we only want to read the
    // portion of the base file that belongs to this split, not the whole file.
    this.baseReader = reader.rowsOptions(readerOptions.range(offset, length));

    // The schema of vectorizedRowBatchBase is picked up from the baseReader because the
    // SchemaEvolution logic lives in the ORC layer, which understands how to map the user
    // schema to the acid event schema.
    if (this.baseReader instanceof RecordReaderImpl) {
      this.vectorizedRowBatchBase = ((RecordReaderImpl) this.baseReader).createRowBatch();
    } else {
      throw new IOException("Failed to create vectorized row batch for the reader of type "
          + this.baseReader.getClass().getName());
    }

    int partitionColumnCount = (rbCtx != null) ? rbCtx.getPartitionColumnCount() : 0;
    if (partitionColumnCount > 0) {
      partitionValues = new Object[partitionColumnCount];
      VectorizedRowBatchCtx.getPartitionValues(rbCtx, conf, orcSplit, partitionValues);
    } else {
      partitionValues = null;
    }

    String txnString = conf.get(ValidTxnList.VALID_TXNS_KEY);
    this.validTxnList = (txnString == null) ? new ValidReadTxnList() : new ValidReadTxnList(txnString);

    // Clone readerOptions for the delete events.
    Reader.Options deleteEventReaderOptions = readerOptions.clone();
    // Set the range on the deleteEventReaderOptions to (0, Long.MAX_VALUE) because
    // we always want to read all the delete delta files.
    deleteEventReaderOptions.range(0, Long.MAX_VALUE);
    // Disable SARGs for the delete event readers, as SARGs have no meaning for delete deltas.
    deleteEventReaderOptions.searchArgument(null, null);
    try {
      // See if we can load all the delete events from all the delete deltas into memory...
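      // (ColumnizedDeleteEventRegistry throws DeleteEventsOverflowMemoryException when the
      // total delete event count, estimated from the ORC acid stats, exceeds
      // ConfVars.HIVE_TRANSACTIONAL_NUM_EVENTS_IN_MEMORY; that is what triggers the fallback
      // in the catch block below.)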
      this.deleteEventRegistry = new ColumnizedDeleteEventRegistry(conf, orcSplit,
          deleteEventReaderOptions);
    } catch (DeleteEventsOverflowMemoryException e) {
      // If not, then create a set of hanging readers that do a sort-merge to find the next
      // smallest delete event on demand. This caps the memory consumption at roughly
      // (some constant * number of readers).
      this.deleteEventRegistry = new SortMergedDeleteEventRegistry(conf, orcSplit,
          deleteEventReaderOptions);
    }
  }

  /**
   * Returns whether it is possible to create a valid instance of this class for a given split.
   * @param conf is the job configuration
   * @param inputSplit is the split to check
   * @return true if it is possible, else false.
   */
  public static boolean canCreateVectorizedAcidRowBatchReaderOnSplit(JobConf conf, InputSplit inputSplit) {
    if (!(inputSplit instanceof OrcSplit)) {
      return false; // must be an instance of OrcSplit.
    }
    // First check if we are reading any original files in the split.
    // To simplify the vectorization logic, the vectorized acid row batch reader does not handle
    // original files for now, as they have a different schema than a regular ACID file.
    final OrcSplit split = (OrcSplit) inputSplit;
    if (AcidUtils.getAcidOperationalProperties(conf).isSplitUpdate() && !split.isOriginal()) {
      // When split-update is turned on for ACID, a more optimized vectorized batch reader
      // can be created. But this is still only possible when we are *NOT* reading any originals.
      return true;
    }
    return false; // no split-update or possibly reading originals!
  }

  private static Path[] getDeleteDeltaDirsFromSplit(OrcSplit orcSplit) throws IOException {
    Path path = orcSplit.getPath();
    Path root;
    if (orcSplit.hasBase()) {
      if (orcSplit.isOriginal()) {
        root = path.getParent();
      } else {
        root = path.getParent().getParent();
      }
    } else {
      root = path;
    }
    return AcidUtils.deserializeDeleteDeltas(root, orcSplit.getDeltas());
  }

  @Override
  public boolean next(NullWritable key, VectorizedRowBatch value) throws IOException {
    try {
      // Check and update partition cols if necessary. Ideally, this should be done
      // in createValue() as the partition is constant per split. But since Hive uses
      // CombineHiveRecordReader, which does not call createValue() for each new RecordReader
      // it creates, this check is required in next().
      if (addPartitionCols) {
        if (partitionValues != null) {
          rbCtx.addPartitionColsToBatch(value, partitionValues);
        }
        addPartitionCols = false;
      }
      if (!baseReader.nextBatch(vectorizedRowBatchBase)) {
        return false;
      }
    } catch (Exception e) {
      throw new IOException("error iterating", e);
    }

    // Once we have read the vectorizedRowBatchBase from the file, there are two kinds of cases
    // for which we might have to discard rows from the batch:
    // Case 1- when the row was created by a transaction that is not valid, or
    // Case 2- when the row has been deleted.
    // We will go through the batch to discover rows which match either of these cases and
    // specifically remove them from the selected vector. Of course, selectedInUse should also
    // be set.
    BitSet selectedBitSet = new BitSet(vectorizedRowBatchBase.size);
    if (vectorizedRowBatchBase.selectedInUse) {
      // When selectedInUse is true, start with every bit set to false and selectively set
      // certain bits to true based on the selected[] vector.
      selectedBitSet.set(0, vectorizedRowBatchBase.size, false);
      for (int j = 0; j < vectorizedRowBatchBase.size; ++j) {
        int i = vectorizedRowBatchBase.selected[j];
        selectedBitSet.set(i);
      }
    } else {
      // When selectedInUse is false, everything in the batch is selected.
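      // (For example, a batch with size == 1024 and selectedInUse == false logically contains
      // rows 0..1023, so all 1024 bits start out set; after the filtering below, only the
      // surviving positions remain set and are copied back into selected[].)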
      selectedBitSet.set(0, vectorizedRowBatchBase.size, true);
    }

    // Case 1- find rows which belong to transactions that are not valid.
    findRecordsWithInvalidTransactionIds(vectorizedRowBatchBase, selectedBitSet);
    // Case 2- find rows which have been deleted.
    this.deleteEventRegistry.findDeletedRecords(vectorizedRowBatchBase, selectedBitSet);

    if (selectedBitSet.cardinality() == vectorizedRowBatchBase.size) {
      // None of the cases above matched and everything is selected. Hence, we will use the
      // same values for selected and selectedInUse.
      value.size = vectorizedRowBatchBase.size;
      value.selected = vectorizedRowBatchBase.selected;
      value.selectedInUse = vectorizedRowBatchBase.selectedInUse;
    } else {
      value.size = selectedBitSet.cardinality();
      value.selectedInUse = true;
      value.selected = new int[selectedBitSet.cardinality()];
      // This loop fills up the selected[] vector with all the index positions that are selected.
      for (int setBitIndex = selectedBitSet.nextSetBit(0), selectedItr = 0;
           setBitIndex >= 0;
           setBitIndex = selectedBitSet.nextSetBit(setBitIndex + 1), ++selectedItr) {
        value.selected[selectedItr] = setBitIndex;
      }
    }

    // Finally, link up the column vectors from the base VectorizedRowBatch to the outgoing batch.
    // NOTE: We only link up the user columns and not the ACID metadata columns because this
    // vectorized code path is not used for update/delete, where the metadata columns would be
    // expected to be passed up the operator pipeline. This is because update/delete currently
    // disable the vectorized code paths specifically.
    // This happens at ql/exec/Utilities.java::3293 when it checks for mapWork.getVectorMode().
    StructColumnVector payloadStruct = (StructColumnVector) vectorizedRowBatchBase.cols[OrcRecordUpdater.ROW];
    // Transfer the columnVector objects from the base batch to the outgoing batch.
    System.arraycopy(payloadStruct.fields, 0, value.cols, 0, value.getDataColumnCount());
    progress = baseReader.getProgress();
    return true;
  }

  private void findRecordsWithInvalidTransactionIds(VectorizedRowBatch batch, BitSet selectedBitSet) {
    if (batch.cols[OrcRecordUpdater.CURRENT_TRANSACTION].isRepeating) {
      // When we have repeating values, we can unset the whole bitset at once
      // if the repeating value is not a valid transaction.
      long currentTransactionIdForBatch = ((LongColumnVector)
          batch.cols[OrcRecordUpdater.CURRENT_TRANSACTION]).vector[0];
      if (!validTxnList.isTxnValid(currentTransactionIdForBatch)) {
        selectedBitSet.clear(0, batch.size);
      }
      return;
    }
    long[] currentTransactionVector =
        ((LongColumnVector) batch.cols[OrcRecordUpdater.CURRENT_TRANSACTION]).vector;
    // Loop through the bits that are set to true and clear those rows whose
    // current transactions are not valid.
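    // (For example, if currentTransactionVector holds {5, 9, 5} and validTxnList reports
    // transaction 9 as still open or aborted, only the bit for the second row is cleared here;
    // rows written by transaction 5 stay selected.)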
    for (int setBitIndex = selectedBitSet.nextSetBit(0);
         setBitIndex >= 0;
         setBitIndex = selectedBitSet.nextSetBit(setBitIndex + 1)) {
      if (!validTxnList.isTxnValid(currentTransactionVector[setBitIndex])) {
        selectedBitSet.clear(setBitIndex);
      }
    }
  }

  @Override
  public NullWritable createKey() {
    return NullWritable.get();
  }

  @Override
  public VectorizedRowBatch createValue() {
    return rbCtx.createVectorizedRowBatch();
  }

  @Override
  public long getPos() throws IOException {
    return offset + (long) (progress * length);
  }

  @Override
  public void close() throws IOException {
    try {
      this.baseReader.close();
    } finally {
      this.deleteEventRegistry.close();
    }
  }

  @Override
  public float getProgress() throws IOException {
    return progress;
  }

  @VisibleForTesting
  DeleteEventRegistry getDeleteEventRegistry() {
    return deleteEventRegistry;
  }

  /**
   * An interface that can determine which rows have been deleted
   * from a given vectorized row batch. Implementations of this interface
   * will read the delete delta files and will create their own internal
   * data structures to maintain record ids of the records that got deleted.
   */
  static interface DeleteEventRegistry {
    /**
     * Modifies the passed bitset to indicate which of the rows in the batch
     * have been deleted. Assumes that batch.size is equal to the bitset size.
     * @param batch
     * @param selectedBitSet
     * @throws IOException
     */
    public void findDeletedRecords(VectorizedRowBatch batch, BitSet selectedBitSet) throws IOException;

    /**
     * The close() method can be called externally to signal the implementing classes
     * to free up resources.
     * @throws IOException
     */
    public void close() throws IOException;
  }

  /**
   * An implementation of DeleteEventRegistry that opens the delete delta files all
   * at once, and then uses a sort-merge algorithm to maintain a sorted list of
   * delete events. This internally uses the OrcRawRecordMerger and maintains a constant
   * amount of memory usage, given the number of delete delta files. Therefore, this
   * implementation is picked when the memory pressure is high.
   */
  static class SortMergedDeleteEventRegistry implements DeleteEventRegistry {
    private OrcRawRecordMerger deleteRecords;
    private OrcRawRecordMerger.ReaderKey deleteRecordKey;
    private OrcStruct deleteRecordValue;
    private boolean isDeleteRecordAvailable = true;
    private ValidTxnList validTxnList;

    public SortMergedDeleteEventRegistry(JobConf conf, OrcSplit orcSplit,
        Reader.Options readerOptions) throws IOException {
      final Path[] deleteDeltas = getDeleteDeltaDirsFromSplit(orcSplit);
      if (deleteDeltas.length > 0) {
        int bucket = AcidUtils.parseBaseOrDeltaBucketFilename(orcSplit.getPath(), conf).getBucket();
        String txnString = conf.get(ValidTxnList.VALID_TXNS_KEY);
        this.validTxnList = (txnString == null) ? new ValidReadTxnList() : new ValidReadTxnList(txnString);
        this.deleteRecords = new OrcRawRecordMerger(conf, true, null, false, bucket,
            validTxnList, readerOptions, deleteDeltas);
        this.deleteRecordKey = new OrcRawRecordMerger.ReaderKey();
        this.deleteRecordValue = this.deleteRecords.createValue();
        // Initialize the first value in the delete reader.
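        // (OrcRawRecordMerger hands back the delete events ordered by (originalTransaction,
        // bucket, rowId), the same order in which rows appear in the base/delta files;
        // findDeletedRecords() below relies on this ordering to do a single forward
        // sort-merge pass per batch.)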
        this.isDeleteRecordAvailable = this.deleteRecords.next(deleteRecordKey, deleteRecordValue);
      } else {
        this.isDeleteRecordAvailable = false;
        this.deleteRecordKey = null;
        this.deleteRecordValue = null;
        this.deleteRecords = null;
      }
    }

    @Override
    public void findDeletedRecords(VectorizedRowBatch batch, BitSet selectedBitSet) throws IOException {
      if (!isDeleteRecordAvailable) {
        return;
      }

      long[] originalTransaction =
          batch.cols[OrcRecordUpdater.ORIGINAL_TRANSACTION].isRepeating ? null
              : ((LongColumnVector) batch.cols[OrcRecordUpdater.ORIGINAL_TRANSACTION]).vector;
      long[] bucket =
          batch.cols[OrcRecordUpdater.BUCKET].isRepeating ? null
              : ((LongColumnVector) batch.cols[OrcRecordUpdater.BUCKET]).vector;
      long[] rowId =
          batch.cols[OrcRecordUpdater.ROW_ID].isRepeating ? null
              : ((LongColumnVector) batch.cols[OrcRecordUpdater.ROW_ID]).vector;

      // The following repeatedX values will be set if any of the columns are repeating.
      long repeatedOriginalTransaction = (originalTransaction != null) ? -1
          : ((LongColumnVector) batch.cols[OrcRecordUpdater.ORIGINAL_TRANSACTION]).vector[0];
      long repeatedBucket = (bucket != null) ? -1
          : ((LongColumnVector) batch.cols[OrcRecordUpdater.BUCKET]).vector[0];
      long repeatedRowId = (rowId != null) ? -1
          : ((LongColumnVector) batch.cols[OrcRecordUpdater.ROW_ID]).vector[0];

      // Get the first valid row in the batch still available.
      int firstValidIndex = selectedBitSet.nextSetBit(0);
      if (firstValidIndex == -1) {
        return; // Everything in the batch has already been filtered out.
      }
      RecordIdentifier firstRecordIdInBatch = new RecordIdentifier(
          originalTransaction != null ? originalTransaction[firstValidIndex] : repeatedOriginalTransaction,
          bucket != null ? (int) bucket[firstValidIndex] : (int) repeatedBucket,
          rowId != null ? rowId[firstValidIndex] : repeatedRowId);

      // Get the last valid row in the batch still available.
      int lastValidIndex = selectedBitSet.previousSetBit(batch.size - 1);
      RecordIdentifier lastRecordIdInBatch = new RecordIdentifier(
          originalTransaction != null ? originalTransaction[lastValidIndex] : repeatedOriginalTransaction,
          bucket != null ? (int) bucket[lastValidIndex] : (int) repeatedBucket,
          rowId != null ? rowId[lastValidIndex] : repeatedRowId);

      // We must iterate over the delete records until we find one record with
      // deleteRecord >= firstRecordInBatch, or until we exhaust all the delete records.
      while (deleteRecordKey.compareRow(firstRecordIdInBatch) == -1) {
        isDeleteRecordAvailable = deleteRecords.next(deleteRecordKey, deleteRecordValue);
        if (!isDeleteRecordAvailable) {
          return; // exhausted all delete records, return.
        }
      }

      // If we are here, then we have established that firstRecordInBatch <= deleteRecord.
      // Now continue marking records which have been deleted until we reach the end of the batch
      // or we exhaust all the delete records.
      int currIndex = firstValidIndex;
      RecordIdentifier currRecordIdInBatch = new RecordIdentifier();
      while (isDeleteRecordAvailable && currIndex != -1 && currIndex <= lastValidIndex) {
        currRecordIdInBatch.setValues(
            (originalTransaction != null) ? originalTransaction[currIndex] : repeatedOriginalTransaction,
            (bucket != null) ? (int) bucket[currIndex] : (int) repeatedBucket,
            (rowId != null) ? rowId[currIndex] : repeatedRowId);
        if (deleteRecordKey.compareRow(currRecordIdInBatch) == 0) {
          // When deleteRecordId == currRecordIdInBatch, this record in the batch has been deleted.
          selectedBitSet.clear(currIndex);
          currIndex = selectedBitSet.nextSetBit(currIndex + 1); // Move to the next valid index.
        } else if (deleteRecordKey.compareRow(currRecordIdInBatch) == 1) {
          // When deleteRecordId > currRecordIdInBatch, we have to move on to look at the
          // next record in the batch.
          // But before that, can we short-circuit and skip the entire batch itself
          // by checking if the deleteRecordId > lastRecordInBatch?
          if (deleteRecordKey.compareRow(lastRecordIdInBatch) == 1) {
            return; // Yay! We short-circuited, skip everything remaining in the batch and return.
          }
          currIndex = selectedBitSet.nextSetBit(currIndex + 1); // Move to the next valid index.
        } else {
          // We have deleteRecordId < currRecordIdInBatch, so we must now move on to find the
          // next larger deleteRecordId that could possibly match anything in the batch.
          isDeleteRecordAvailable = deleteRecords.next(deleteRecordKey, deleteRecordValue);
        }
      }
    }

    @Override
    public void close() throws IOException {
      if (this.deleteRecords != null) {
        this.deleteRecords.close();
      }
    }
  }

  /**
   * An implementation of DeleteEventRegistry that optimizes for performance by loading
   * all the delete events into memory at once from all the delete delta files.
   * It starts by reading all the delete events through a regular sort-merge logic
   * into two vectors: one for the original transaction id (otid), and the other for the row id.
   * (In the current version, since the bucket id should be the same for all the delete deltas,
   * it is not stored.) The otids are likely to be repeated very often, as a single transaction
   * often deletes thousands of rows. Hence, the otid vector is compressed to only store the
   * fromIndex and toIndex ranges into the larger row id vector. Querying whether a
   * record id is deleted or not is then done by performing a binary search on the
   * compressed otid ranges. If a match is found, a binary search is performed on
   * the larger rowId vector between the given fromIndex and toIndex. Of course, there is a rough
   * heuristic that prevents creation of an instance of this class if the memory pressure is high.
   * The SortMergedDeleteEventRegistry is then the fallback method for such scenarios.
   */
  static class ColumnizedDeleteEventRegistry implements DeleteEventRegistry {
    /**
     * A simple wrapper class to hold the (otid, rowId) pair.
     */
    static class DeleteRecordKey implements Comparable<DeleteRecordKey> {
      private long originalTransactionId;
      private long rowId;

      public DeleteRecordKey() {
        this.originalTransactionId = -1;
        this.rowId = -1;
      }

      public DeleteRecordKey(long otid, long rowId) {
        this.originalTransactionId = otid;
        this.rowId = rowId;
      }

      public void set(long otid, long rowId) {
        this.originalTransactionId = otid;
        this.rowId = rowId;
      }

      @Override
      public int compareTo(DeleteRecordKey other) {
        if (other == null) {
          return -1;
        }
        if (originalTransactionId != other.originalTransactionId) {
          return originalTransactionId < other.originalTransactionId ? -1 : 1;
        }
        if (rowId != other.rowId) {
          return rowId < other.rowId ? -1 : 1;
        }
        return 0;
      }
    }

    /**
     * This class actually reads the delete delta files in vectorized row batches.
     * For every call to next(), it returns the next smallest record id in the file, if available.
     * Internally, next() buffers a row batch and maintains an index pointer, reading the
     * next batch when the previous batch is exhausted.
     */
    static class DeleteReaderValue {
      private VectorizedRowBatch batch;
      private final RecordReader recordReader;
      private int indexPtrInBatch;
      private final int bucketForSplit; // The bucket value should be the same for all the records.
      private final ValidTxnList validTxnList;

      public DeleteReaderValue(Reader deleteDeltaReader, Reader.Options readerOptions, int bucket,
          ValidTxnList validTxnList) throws IOException {
        this.recordReader = deleteDeltaReader.rowsOptions(readerOptions);
        this.bucketForSplit = bucket;
        this.batch = deleteDeltaReader.getSchema().createRowBatch();
        if (!recordReader.nextBatch(batch)) { // Read the first batch.
          this.batch = null; // Oh! the first batch itself was empty, there is nothing to read.
        }
        this.indexPtrInBatch = 0;
        this.validTxnList = validTxnList;
      }

      public boolean next(DeleteRecordKey deleteRecordKey) throws IOException {
        if (batch == null) {
          return false;
        }
        boolean isValidNext = false;
        while (!isValidNext) {
          if (indexPtrInBatch >= batch.size) {
            // We have exhausted our current batch, read the next batch.
            if (recordReader.nextBatch(batch)) {
              // Whenever we read a batch, we must ensure that all the records in the batch
              // have the same bucket id as the bucket id of the split. If not, throw an exception.
              // NOTE: this assertion might not hold once virtual bucketing is in place. However,
              // it should be simple to fix that case: just replace the check for bucket equality
              // with a check for a valid bucket mapping. Until virtual bucketing is added, a
              // mismatch means either the split computation got messed up or we found some
              // corrupted records.
              long bucketForRecord = ((LongColumnVector) batch.cols[OrcRecordUpdater.BUCKET]).vector[0];
              if ((batch.size > 1 && !batch.cols[OrcRecordUpdater.BUCKET].isRepeating)
                  || (bucketForRecord != bucketForSplit)) {
                throw new IOException("Corrupted records with different bucket ids "
                    + "from the containing bucket file found! Expected bucket id "
                    + bucketForSplit + ", but found bucket id " + bucketForRecord);
              }
              indexPtrInBatch = 0; // After reading the batch, reset the pointer to the beginning.
            } else {
              return false; // no more batches to read, exhausted the reader.
            }
          }
          int originalTransactionIndex =
              batch.cols[OrcRecordUpdater.ORIGINAL_TRANSACTION].isRepeating ? 0 : indexPtrInBatch;
          long originalTransaction =
              ((LongColumnVector) batch.cols[OrcRecordUpdater.ORIGINAL_TRANSACTION]).vector[originalTransactionIndex];
          long rowId = ((LongColumnVector) batch.cols[OrcRecordUpdater.ROW_ID]).vector[indexPtrInBatch];
          int currentTransactionIndex =
              batch.cols[OrcRecordUpdater.CURRENT_TRANSACTION].isRepeating ? 0 : indexPtrInBatch;
          long currentTransaction =
              ((LongColumnVector) batch.cols[OrcRecordUpdater.CURRENT_TRANSACTION]).vector[currentTransactionIndex];
          ++indexPtrInBatch;
          if (validTxnList.isTxnValid(currentTransaction)) {
            isValidNext = true;
            deleteRecordKey.set(originalTransaction, rowId);
          }
        }
        return true;
      }

      public void close() throws IOException {
        this.recordReader.close();
      }
    }

    /**
     * A CompressedOtid stores a compressed representation of the original
     * transaction ids (otids) read from the delete delta files. Since the record ids
     * are sorted by (otid, rowId) and otids are highly likely to be repetitive, it is
     * efficient to compress them as a CompressedOtid that stores the fromIndex and
     * the toIndex. These fromIndex and toIndex reference the larger vector formed by
     * concatenating the correspondingly ordered rowIds.
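     *
     * For example (illustrative values only), if the sorted delete events are
     * (otid=5, rowId=1), (otid=5, rowId=4), (otid=5, rowId=9), (otid=7, rowId=2), (otid=7, rowId=3),
     * then rowIds = {1, 4, 9, 2, 3} and the compressed representation is
     * CompressedOtid(5, 0, 3) followed by CompressedOtid(7, 3, 5).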
     */
    private class CompressedOtid implements Comparable<CompressedOtid> {
      long originalTransactionId;
      int fromIndex; // inclusive
      int toIndex; // exclusive

      public CompressedOtid(long otid, int fromIndex, int toIndex) {
        this.originalTransactionId = otid;
        this.fromIndex = fromIndex;
        this.toIndex = toIndex;
      }

      @Override
      public int compareTo(CompressedOtid other) {
        // When comparing CompressedOtids, the one with the lesser otid is smaller.
        if (originalTransactionId != other.originalTransactionId) {
          return originalTransactionId < other.originalTransactionId ? -1 : 1;
        }
        return 0;
      }
    }

    private TreeMap<DeleteRecordKey, DeleteReaderValue> sortMerger;
    private long[] rowIds;
    private CompressedOtid[] compressedOtids;
    private ValidTxnList validTxnList;

    public ColumnizedDeleteEventRegistry(JobConf conf, OrcSplit orcSplit,
        Reader.Options readerOptions) throws IOException, DeleteEventsOverflowMemoryException {
      int bucket = AcidUtils.parseBaseOrDeltaBucketFilename(orcSplit.getPath(), conf).getBucket();
      String txnString = conf.get(ValidTxnList.VALID_TXNS_KEY);
      this.validTxnList = (txnString == null) ? new ValidReadTxnList() : new ValidReadTxnList(txnString);
      this.sortMerger = new TreeMap<DeleteRecordKey, DeleteReaderValue>();
      this.rowIds = null;
      this.compressedOtids = null;
      int maxEventsInMemory = HiveConf.getIntVar(conf, ConfVars.HIVE_TRANSACTIONAL_NUM_EVENTS_IN_MEMORY);

      try {
        final Path[] deleteDeltaDirs = getDeleteDeltaDirsFromSplit(orcSplit);
        if (deleteDeltaDirs.length > 0) {
          int totalDeleteEventCount = 0;
          for (Path deleteDeltaDir : deleteDeltaDirs) {
            Path deleteDeltaFile = AcidUtils.createBucketFile(deleteDeltaDir, bucket);
            FileSystem fs = deleteDeltaFile.getFileSystem(conf);
            // NOTE: Calling getLastFlushLength() below is more for future-proofing when we have
            // streaming deletes. But currently we don't support streaming deletes, and this can
            // be removed if it becomes a performance issue.
            long length = OrcAcidUtils.getLastFlushLength(fs, deleteDeltaFile);
            // NOTE: A check for the existence of deleteDeltaFile is required because we may not
            // have deletes for the bucket being taken into consideration for this split processing.
            if (length != -1 && fs.exists(deleteDeltaFile)) {
              Reader deleteDeltaReader = OrcFile.createReader(deleteDeltaFile,
                  OrcFile.readerOptions(conf).maxLength(length));
              AcidStats acidStats = OrcAcidUtils.parseAcidStats(deleteDeltaReader);
              if (acidStats.deletes == 0) {
                continue; // just a safety check to ensure that we are not reading empty delete files.
              }
              totalDeleteEventCount += acidStats.deletes;
              if (totalDeleteEventCount > maxEventsInMemory) {
                // ColumnizedDeleteEventRegistry loads all the delete events from all the delete
                // deltas into memory. To prevent out-of-memory errors, this check is a rough
                // heuristic that prevents creation of an object of this class if the total number
                // of delete events exceeds this value. By default, it is set to 10 million delete
                // events per bucket.
                LOG.info("Total number of delete events exceeds the maximum number of delete events "
                    + "that can be loaded into memory for the delete deltas in the directory at: "
                    + Arrays.toString(deleteDeltaDirs)
                    + ". The max limit is currently set at " + maxEventsInMemory
                    + " and can be changed by setting the Hive config variable "
                    + ConfVars.HIVE_TRANSACTIONAL_NUM_EVENTS_IN_MEMORY.varname);
                throw new DeleteEventsOverflowMemoryException();
              }
              DeleteReaderValue deleteReaderValue = new DeleteReaderValue(deleteDeltaReader,
                  readerOptions, bucket, validTxnList);
              DeleteRecordKey deleteRecordKey = new DeleteRecordKey();
              if (deleteReaderValue.next(deleteRecordKey)) {
                sortMerger.put(deleteRecordKey, deleteReaderValue);
              } else {
                deleteReaderValue.close();
              }
            }
          }
          if (totalDeleteEventCount > 0) {
            // Initialize the rowId array when we have some delete events.
            rowIds = new long[totalDeleteEventCount];
            readAllDeleteEventsFromDeleteDeltas();
          }
        }
      } catch (IOException | DeleteEventsOverflowMemoryException e) {
        close(); // close any open readers, if there was some exception during initialization.
        throw e; // rethrow the exception so that the caller can handle it.
      }
    }

    private void readAllDeleteEventsFromDeleteDeltas() throws IOException {
      if (sortMerger == null || sortMerger.isEmpty()) {
        return; // trivial case, nothing to read.
      }
      int distinctOtids = 0;
      long lastSeenOtid = -1;
      long[] otids = new long[rowIds.length];
      int index = 0;
      while (!sortMerger.isEmpty()) {
        // The sortMerger is a TreeMap used as a priority queue here. It stores a pair of
        // (deleteRecordKey, deleteReaderValue) at each node and is ordered by deleteRecordKey.
        // The deleteReaderValue is the actual wrapper class that holds the reference to the
        // underlying delta file that is being read, and its corresponding deleteRecordKey
        // is the smallest record id for that file. In each iteration of this loop, we extract
        // (poll) the minimum deleteRecordKey pair. Once we have processed that deleteRecordKey,
        // we advance the pointer for the corresponding deleteReaderValue. If the underlying file
        // itself has no more records, then we remove that pair from the sortMerger, or else we
        // add the updated pair back.
        Entry<DeleteRecordKey, DeleteReaderValue> entry = sortMerger.pollFirstEntry();
        DeleteRecordKey deleteRecordKey = entry.getKey();
        DeleteReaderValue deleteReaderValue = entry.getValue();
        otids[index] = deleteRecordKey.originalTransactionId;
        rowIds[index] = deleteRecordKey.rowId;
        ++index;
        if (lastSeenOtid != deleteRecordKey.originalTransactionId) {
          ++distinctOtids;
          lastSeenOtid = deleteRecordKey.originalTransactionId;
        }
        if (deleteReaderValue.next(deleteRecordKey)) {
          sortMerger.put(deleteRecordKey, deleteReaderValue);
        } else {
          deleteReaderValue.close(); // Exhausted reading all records, close the reader.
        }
      }

      // Once we have processed all the delete events and seen all the distinct otids,
      // we compress the otids into the CompressedOtid data structure, which records
      // the fromIndex (inclusive) and toIndex (exclusive) for each unique otid.
      this.compressedOtids = new CompressedOtid[distinctOtids];
      lastSeenOtid = otids[0];
      int fromIndex = 0, pos = 0;
      for (int i = 1; i < otids.length; ++i) {
        if (otids[i] != lastSeenOtid) {
          compressedOtids[pos] = new CompressedOtid(lastSeenOtid, fromIndex, i);
          lastSeenOtid = otids[i];
          fromIndex = i;
          ++pos;
        }
      }
      // account for the last distinct otid
      compressedOtids[pos] = new CompressedOtid(lastSeenOtid, fromIndex, otids.length);
    }

    private boolean isDeleted(long otid, long rowId) {
      if (compressedOtids == null || rowIds == null) {
        return false;
      }
      // To find whether a given (otid, rowId) pair is deleted or not, we perform
      // at most two binary searches. The first binary search is on the
      // compressed otids.
      // Only if a match is found do we perform the second binary search, in the larger rowId
      // vector between the given fromIndex and toIndex.

      // Check if otid is outside the range of all otids present.
      if (otid < compressedOtids[0].originalTransactionId
          || otid > compressedOtids[compressedOtids.length - 1].originalTransactionId) {
        return false;
      }
      // Create a dummy key for searching the otid in the compressed otid ranges.
      CompressedOtid key = new CompressedOtid(otid, -1, -1);
      int pos = Arrays.binarySearch(compressedOtids, key);
      if (pos >= 0) {
        // Otid with the given value found! Searching now for rowId...
        key = compressedOtids[pos]; // Retrieve the actual CompressedOtid that matched.
        // Check if rowId is outside the range of all rowIds present for this otid.
        if (rowId < rowIds[key.fromIndex] || rowId > rowIds[key.toIndex - 1]) {
          return false;
        }
        if (Arrays.binarySearch(rowIds, key.fromIndex, key.toIndex, rowId) >= 0) {
          return true; // rowId also found!
        }
      }
      return false;
    }

    @Override
    public void findDeletedRecords(VectorizedRowBatch batch, BitSet selectedBitSet) throws IOException {
      if (rowIds == null || compressedOtids == null) {
        return;
      }
      // Iterate through the batch and check for each (otid, rowId) in the batch
      // whether it has been deleted or not.
      long[] originalTransactionVector =
          batch.cols[OrcRecordUpdater.ORIGINAL_TRANSACTION].isRepeating ? null
              : ((LongColumnVector) batch.cols[OrcRecordUpdater.ORIGINAL_TRANSACTION]).vector;
      long repeatedOriginalTransaction = (originalTransactionVector != null) ? -1
          : ((LongColumnVector) batch.cols[OrcRecordUpdater.ORIGINAL_TRANSACTION]).vector[0];
      long[] rowIdVector = ((LongColumnVector) batch.cols[OrcRecordUpdater.ROW_ID]).vector;

      for (int setBitIndex = selectedBitSet.nextSetBit(0);
           setBitIndex >= 0;
           setBitIndex = selectedBitSet.nextSetBit(setBitIndex + 1)) {
        long otid = originalTransactionVector != null
            ? originalTransactionVector[setBitIndex] : repeatedOriginalTransaction;
        long rowId = rowIdVector[setBitIndex];
        if (isDeleted(otid, rowId)) {
          selectedBitSet.clear(setBitIndex);
        }
      }
    }

    @Override
    public void close() throws IOException {
      // ColumnizedDeleteEventRegistry reads all the delete events into memory during
      // initialization and closes the delete event readers afterwards. If an exception gets
      // thrown during initialization, we may have to close any readers that are still left open.
      while (!sortMerger.isEmpty()) {
        Entry<DeleteRecordKey, DeleteReaderValue> entry = sortMerger.pollFirstEntry();
        entry.getValue().close(); // close the reader for this entry
      }
    }
  }

  static class DeleteEventsOverflowMemoryException extends Exception {
    private static final long serialVersionUID = 1L;
  }
}