/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.io.orc;
import java.io.IOException;
import java.util.Arrays;
import java.util.BitSet;
import java.util.List;
import java.util.Map.Entry;
import java.util.TreeMap;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.ValidReadTxnList;
import org.apache.hadoop.hive.common.ValidTxnList;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx;
import org.apache.hadoop.hive.ql.io.AcidUtils;
import org.apache.hadoop.hive.ql.io.RecordIdentifier;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Reporter;
import org.apache.orc.OrcProto;
import org.apache.orc.OrcUtils;
import org.apache.orc.TypeDescription;
import org.apache.orc.impl.AcidStats;
import org.apache.orc.impl.OrcAcidUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.annotations.VisibleForTesting;
/**
* A fast vectorized batch reader class for ACID when split-update behavior is enabled.
* When split-update is enabled, the final version of a row can be produced without
* row-by-row stitching, because there are only insert and delete events. Insert events can be
* read directly from the base files/insert_only deltas in vectorized row batches, and the
* deleted rows can then be filtered out via the 'selected' field of the vectorized row batch.
* Refer to HIVE-14233 for more details.
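*
* An illustrative usage sketch (hypothetical driver code, shown only to illustrate the
* RecordReader contract):
* <pre>{@code
*   VectorizedOrcAcidRowBatchReader reader =
*       new VectorizedOrcAcidRowBatchReader(orcSplit, jobConf, reporter);
*   NullWritable key = reader.createKey();
*   VectorizedRowBatch batch = reader.createValue();
*   while (reader.next(key, batch)) {
*     // Only the rows referenced by batch.selected (when batch.selectedInUse is true)
*     // survived the transaction-validity and delete-event filtering.
*   }
*   reader.close();
* }</pre>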
*/
public class VectorizedOrcAcidRowBatchReader
implements org.apache.hadoop.mapred.RecordReader<NullWritable,VectorizedRowBatch> {
private static final Logger LOG = LoggerFactory.getLogger(VectorizedOrcAcidRowBatchReader.class);
private org.apache.hadoop.hive.ql.io.orc.RecordReader baseReader;
private VectorizedRowBatchCtx rbCtx;
private VectorizedRowBatch vectorizedRowBatchBase;
private long offset;
private long length;
private float progress = 0.0f;
private Object[] partitionValues;
private boolean addPartitionCols = true;
private ValidTxnList validTxnList;
private DeleteEventRegistry deleteEventRegistry;
public VectorizedOrcAcidRowBatchReader(InputSplit inputSplit, JobConf conf,
Reporter reporter) throws IOException {
final boolean isAcidRead = HiveConf.getBoolVar(conf, ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN);
final AcidUtils.AcidOperationalProperties acidOperationalProperties
= AcidUtils.getAcidOperationalProperties(conf);
// This type of VectorizedOrcAcidRowBatchReader can only be created when split-update is
// enabled for an ACID case and the file format is ORC.
boolean isReadNotAllowed = !isAcidRead || !acidOperationalProperties.isSplitUpdate()
|| !(inputSplit instanceof OrcSplit);
if (isReadNotAllowed) {
OrcInputFormat.raiseAcidTablesMustBeReadWithAcidReaderException(conf);
}
final OrcSplit orcSplit = (OrcSplit) inputSplit;
rbCtx = Utilities.getVectorizedRowBatchCtx(conf);
reporter.setStatus(orcSplit.toString());
Reader reader = OrcInputFormat.createOrcReaderForSplit(conf, orcSplit);
Reader.Options readerOptions = OrcInputFormat.createOptionsForReader(conf);
readerOptions = OrcRawRecordMerger.createEventOptions(readerOptions);
this.offset = orcSplit.getStart();
this.length = orcSplit.getLength();
// Careful with the range here: unlike the delete deltas, we do not want to read the whole base file, only the split's range.
this.baseReader = reader.rowsOptions(readerOptions.range(offset, length));
// The schema of vectorizedRowBatchBase is picked up from the baseReader because schema
// evolution happens at the ORC layer, which understands how to map the user schema to the ACID schema.
if (this.baseReader instanceof RecordReaderImpl) {
this.vectorizedRowBatchBase = ((RecordReaderImpl) this.baseReader).createRowBatch();
} else {
throw new IOException("Failed to create vectorized row batch for the reader of type "
+ this.baseReader.getClass().getName());
}
int partitionColumnCount = (rbCtx != null) ? rbCtx.getPartitionColumnCount() : 0;
if (partitionColumnCount > 0) {
partitionValues = new Object[partitionColumnCount];
VectorizedRowBatchCtx.getPartitionValues(rbCtx, conf, orcSplit, partitionValues);
} else {
partitionValues = null;
}
String txnString = conf.get(ValidTxnList.VALID_TXNS_KEY);
this.validTxnList = (txnString == null) ? new ValidReadTxnList() : new ValidReadTxnList(txnString);
// Clone readerOptions for deleteEvents.
Reader.Options deleteEventReaderOptions = readerOptions.clone();
// Set the range on the deleteEventReaderOptions to (0, Long.MAX_VALUE) because
// we always want to read all the delete delta files in their entirety.
deleteEventReaderOptions.range(0, Long.MAX_VALUE);
// Disable SARGs for the delete event readers, as SARGs have no meaning for delete events.
deleteEventReaderOptions.searchArgument(null, null);
try {
// See if we can load all the delete events from all the delete deltas in memory...
this.deleteEventRegistry = new ColumnizedDeleteEventRegistry(conf, orcSplit, deleteEventReaderOptions);
} catch (DeleteEventsOverflowMemoryException e) {
// If not, then create a set of hanging readers that do a sort-merge to find the next smallest
// delete event on demand. This caps the memory consumption to (some constant * no. of readers).
this.deleteEventRegistry = new SortMergedDeleteEventRegistry(conf, orcSplit, deleteEventReaderOptions);
}
}
/**
* Returns whether it is possible to create a valid instance of this class for a given split.
* @param conf is the job configuration
* @param inputSplit is the input split being considered
* @return true if it is possible, else false.
*/
public static boolean canCreateVectorizedAcidRowBatchReaderOnSplit(JobConf conf, InputSplit inputSplit) {
if (!(inputSplit instanceof OrcSplit)) {
return false; // must be an instance of OrcSplit.
}
// First check if we are reading any original files in the split.
// To simplify the vectorization logic, the vectorized acid row batch reader does not handle
// original files for now as they have a different schema than a regular ACID file.
final OrcSplit split = (OrcSplit) inputSplit;
if (AcidUtils.getAcidOperationalProperties(conf).isSplitUpdate() && !split.isOriginal()) {
// When split-update is enabled for ACID, a more optimized vectorized batch reader
// can be created, but only when we are *NOT* reading any original files.
return true;
}
return false; // no split-update or possibly reading originals!
}
private static Path[] getDeleteDeltaDirsFromSplit(OrcSplit orcSplit) throws IOException {
Path path = orcSplit.getPath();
Path root;
if (orcSplit.hasBase()) {
if (orcSplit.isOriginal()) {
root = path.getParent();
} else {
root = path.getParent().getParent();
}
} else {
root = path;
}
return AcidUtils.deserializeDeleteDeltas(root, orcSplit.getDeltas());
}
@Override
public boolean next(NullWritable key, VectorizedRowBatch value) throws IOException {
try {
// Check and update partition cols if necessary. Ideally, this should be done
// in createValue(), as the partition is constant per split. But since Hive uses
// CombineHiveRecordReader, which does not call createValue() for each new RecordReader
// it creates, this check is required in next().
if (addPartitionCols) {
if (partitionValues != null) {
rbCtx.addPartitionColsToBatch(value, partitionValues);
}
addPartitionCols = false;
}
if (!baseReader.nextBatch(vectorizedRowBatchBase)) {
return false;
}
} catch (Exception e) {
throw new IOException("error iterating", e);
}
// Once we have read the VectorizedRowBatchBase from the file, there are two kinds of cases
// for which we might have to discard rows from the batch:
// Case 1- when the row is created by a transaction that is not valid, or
// Case 2- when the row has been deleted.
// We will go through the batch to discover rows which match any of the cases and specifically
// remove them from the selected vector. Of course, selectedInUse should also be set.
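// For example (illustrative values): if the base batch has 5 rows and rows 1 and 3 end up
// filtered out (invalid transaction or deleted), the outgoing batch gets size = 3,
// selectedInUse = true, and selected = [0, 2, 4].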
BitSet selectedBitSet = new BitSet(vectorizedRowBatchBase.size);
if (vectorizedRowBatchBase.selectedInUse) {
// When selectedInUse is true, start with every bit set to false and selectively set
// certain bits to true based on the selected[] vector.
selectedBitSet.set(0, vectorizedRowBatchBase.size, false);
for (int j = 0; j < vectorizedRowBatchBase.size; ++j) {
int i = vectorizedRowBatchBase.selected[j];
selectedBitSet.set(i);
}
} else {
// When selectedInUse is set to false, everything in the batch is selected.
selectedBitSet.set(0, vectorizedRowBatchBase.size, true);
}
// Case 1- find rows which belong to transactions that are not valid.
findRecordsWithInvalidTransactionIds(vectorizedRowBatchBase, selectedBitSet);
// Case 2- find rows which have been deleted.
this.deleteEventRegistry.findDeletedRecords(vectorizedRowBatchBase, selectedBitSet);
if (selectedBitSet.cardinality() == vectorizedRowBatchBase.size) {
// None of the cases above matched and everything is selected. Hence, we can reuse the
// base batch's selected and selectedInUse values as-is.
value.size = vectorizedRowBatchBase.size;
value.selected = vectorizedRowBatchBase.selected;
value.selectedInUse = vectorizedRowBatchBase.selectedInUse;
} else {
value.size = selectedBitSet.cardinality();
value.selectedInUse = true;
value.selected = new int[selectedBitSet.cardinality()];
// This loop fills up the selected[] vector with all the index positions that are selected.
for (int setBitIndex = selectedBitSet.nextSetBit(0), selectedItr = 0;
setBitIndex >= 0;
setBitIndex = selectedBitSet.nextSetBit(setBitIndex+1), ++selectedItr) {
value.selected[selectedItr] = setBitIndex;
}
}
// Finally, link up the column vectors from the base VectorizedRowBatch to the outgoing batch.
// NOTE: We only link up the user columns and not the ACID metadata columns because this
// vectorized code path is not used for update/delete, where the metadata columns
// would be expected to be passed up the operator pipeline. This is because
// update/delete currently disable the vectorized code paths.
// This happens at ql/exec/Utilities.java::3293 when it checks for mapWork.getVectorMode()
StructColumnVector payloadStruct = (StructColumnVector) vectorizedRowBatchBase.cols[OrcRecordUpdater.ROW];
// Transfer columnVector objects from base batch to outgoing batch.
System.arraycopy(payloadStruct.fields, 0, value.cols, 0, value.getDataColumnCount());
progress = baseReader.getProgress();
return true;
}
private void findRecordsWithInvalidTransactionIds(VectorizedRowBatch batch, BitSet selectedBitSet) {
if (batch.cols[OrcRecordUpdater.CURRENT_TRANSACTION].isRepeating) {
// When we have repeating values, we can unset the whole bitset at once
// if the repeating value is not a valid transaction.
long currentTransactionIdForBatch = ((LongColumnVector)
batch.cols[OrcRecordUpdater.CURRENT_TRANSACTION]).vector[0];
if (!validTxnList.isTxnValid(currentTransactionIdForBatch)) {
selectedBitSet.clear(0, batch.size);
}
return;
}
long[] currentTransactionVector =
((LongColumnVector) batch.cols[OrcRecordUpdater.CURRENT_TRANSACTION]).vector;
// Loop through the bits that are set to true and mark those rows as false, if their
// current transactions are not valid.
for (int setBitIndex = selectedBitSet.nextSetBit(0);
setBitIndex >= 0;
setBitIndex = selectedBitSet.nextSetBit(setBitIndex+1)) {
if (!validTxnList.isTxnValid(currentTransactionVector[setBitIndex])) {
selectedBitSet.clear(setBitIndex);
}
}
}
@Override
public NullWritable createKey() {
return NullWritable.get();
}
@Override
public VectorizedRowBatch createValue() {
return rbCtx.createVectorizedRowBatch();
}
@Override
public long getPos() throws IOException {
return offset + (long) (progress * length);
}
@Override
public void close() throws IOException {
try {
this.baseReader.close();
} finally {
this.deleteEventRegistry.close();
}
}
@Override
public float getProgress() throws IOException {
return progress;
}
@VisibleForTesting
DeleteEventRegistry getDeleteEventRegistry() {
return deleteEventRegistry;
}
/**
* An interface that can determine which rows have been deleted
* from a given vectorized row batch. Implementations of this interface
* will read the delete delta files and will create their own internal
* data structures to maintain record ids of the records that got deleted.
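*
* A minimal usage sketch of the contract (as used by next() in this class): the caller builds
* a BitSet with one bit per row in the batch, sets the bits of the currently selected rows,
* and then asks the registry to clear the bits of the deleted rows.
* <pre>{@code
*   BitSet selected = new BitSet(batch.size);
*   selected.set(0, batch.size);  // assume every row is selected to begin with
*   deleteEventRegistry.findDeletedRecords(batch, selected);
* }</pre>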
*/
static interface DeleteEventRegistry {
/**
* Modifies the passed bitset to indicate which of the rows in the batch
* have been deleted. Assumes that the batch.size is equal to bitset size.
* @param batch the vectorized row batch under consideration
* @param selectedBitSet the bitset of currently selected rows; bits for deleted rows are cleared
* @throws IOException
*/
public void findDeletedRecords(VectorizedRowBatch batch, BitSet selectedBitSet) throws IOException;
/**
* The close() method can be called externally to signal the implementing classes
* to free up resources.
* @throws IOException
*/
public void close() throws IOException;
}
/**
* An implementation of DeleteEventRegistry that opens the delete delta files all
* at once, and then uses the sort-merge algorithm to maintain a sorted list of
* delete events. This internally uses the OrcRawRecordMerger and maintains a constant
* amount of memory usage, given the number of delete delta files. Therefore, this
* implementation will be picked up when the memory pressure is high.
*/
static class SortMergedDeleteEventRegistry implements DeleteEventRegistry {
private OrcRawRecordMerger deleteRecords;
private OrcRawRecordMerger.ReaderKey deleteRecordKey;
private OrcStruct deleteRecordValue;
private boolean isDeleteRecordAvailable = true;
private ValidTxnList validTxnList;
public SortMergedDeleteEventRegistry(JobConf conf, OrcSplit orcSplit, Reader.Options readerOptions)
throws IOException {
final Path[] deleteDeltas = getDeleteDeltaDirsFromSplit(orcSplit);
if (deleteDeltas.length > 0) {
int bucket = AcidUtils.parseBaseOrDeltaBucketFilename(orcSplit.getPath(), conf).getBucket();
String txnString = conf.get(ValidTxnList.VALID_TXNS_KEY);
this.validTxnList = (txnString == null) ? new ValidReadTxnList() : new ValidReadTxnList(txnString);
this.deleteRecords = new OrcRawRecordMerger(conf, true, null, false, bucket,
validTxnList, readerOptions, deleteDeltas);
this.deleteRecordKey = new OrcRawRecordMerger.ReaderKey();
this.deleteRecordValue = this.deleteRecords.createValue();
// Initialize the first value in the delete reader.
this.isDeleteRecordAvailable = this.deleteRecords.next(deleteRecordKey, deleteRecordValue);
} else {
this.isDeleteRecordAvailable = false;
this.deleteRecordKey = null;
this.deleteRecordValue = null;
this.deleteRecords = null;
}
}
@Override
public void findDeletedRecords(VectorizedRowBatch batch, BitSet selectedBitSet)
throws IOException {
if (!isDeleteRecordAvailable) {
return;
}
long[] originalTransaction =
batch.cols[OrcRecordUpdater.ORIGINAL_TRANSACTION].isRepeating ? null
: ((LongColumnVector) batch.cols[OrcRecordUpdater.ORIGINAL_TRANSACTION]).vector;
long[] bucket =
batch.cols[OrcRecordUpdater.BUCKET].isRepeating ? null
: ((LongColumnVector) batch.cols[OrcRecordUpdater.BUCKET]).vector;
long[] rowId =
batch.cols[OrcRecordUpdater.ROW_ID].isRepeating ? null
: ((LongColumnVector) batch.cols[OrcRecordUpdater.ROW_ID]).vector;
// The following repeatedX values are only meaningful when the corresponding column is
// repeating (in which case the array above is null).
long repeatedOriginalTransaction = (originalTransaction != null) ? -1
: ((LongColumnVector) batch.cols[OrcRecordUpdater.ORIGINAL_TRANSACTION]).vector[0];
long repeatedBucket = (bucket != null) ? -1
: ((LongColumnVector) batch.cols[OrcRecordUpdater.BUCKET]).vector[0];
long repeatedRowId = (rowId != null) ? -1
: ((LongColumnVector) batch.cols[OrcRecordUpdater.ROW_ID]).vector[0];
// Get the first valid row in the batch still available.
int firstValidIndex = selectedBitSet.nextSetBit(0);
if (firstValidIndex == -1) {
return; // Everything in the batch has already been filtered out.
}
RecordIdentifier firstRecordIdInBatch =
new RecordIdentifier(
originalTransaction != null ? originalTransaction[firstValidIndex] : repeatedOriginalTransaction,
bucket != null ? (int) bucket[firstValidIndex] : (int) repeatedBucket,
rowId != null ? rowId[firstValidIndex] : repeatedRowId);
// Get the last valid row in the batch still available.
int lastValidIndex = selectedBitSet.previousSetBit(batch.size - 1);
RecordIdentifier lastRecordIdInBatch =
new RecordIdentifier(
originalTransaction != null ? originalTransaction[lastValidIndex] : repeatedOriginalTransaction,
bucket != null ? (int) bucket[lastValidIndex] : (int) repeatedBucket,
rowId != null ? rowId[lastValidIndex] : repeatedRowId);
// We must iterate over all the delete records, until we find one record with
// deleteRecord >= firstRecordInBatch or until we exhaust all the delete records.
while (deleteRecordKey.compareRow(firstRecordIdInBatch) == -1) {
isDeleteRecordAvailable = deleteRecords.next(deleteRecordKey, deleteRecordValue);
if (!isDeleteRecordAvailable) return; // exhausted all delete records, return.
}
// If we are here, then we have established that firstRecordInBatch <= deleteRecord.
// Now continue marking records which have been deleted until we reach the end of the batch
// or we exhaust all the delete records.
int currIndex = firstValidIndex;
RecordIdentifier currRecordIdInBatch = new RecordIdentifier();
while (isDeleteRecordAvailable && currIndex != -1 && currIndex <= lastValidIndex) {
currRecordIdInBatch.setValues(
(originalTransaction != null) ? originalTransaction[currIndex] : repeatedOriginalTransaction,
(bucket != null) ? (int) bucket[currIndex] : (int) repeatedBucket,
(rowId != null) ? rowId[currIndex] : repeatedRowId);
if (deleteRecordKey.compareRow(currRecordIdInBatch) == 0) {
// When deleteRecordId == currRecordIdInBatch, this record in the batch has been deleted.
selectedBitSet.clear(currIndex);
currIndex = selectedBitSet.nextSetBit(currIndex + 1); // Move to next valid index.
} else if (deleteRecordKey.compareRow(currRecordIdInBatch) == 1) {
// When deleteRecordId > currRecordIdInBatch, we have to move on to look at the
// next record in the batch.
// But before that, can we short-circuit and skip the entire batch itself
// by checking if the deleteRecordId > lastRecordInBatch?
if (deleteRecordKey.compareRow(lastRecordIdInBatch) == 1) {
return; // Yay! We short-circuited, skip everything remaining in the batch and return.
}
currIndex = selectedBitSet.nextSetBit(currIndex + 1); // Move to next valid index.
} else {
// We have deleteRecordId < currRecordIdInBatch, so we must now move on to find the
// next larger deleteRecordId that can possibly match anything in the batch.
isDeleteRecordAvailable = deleteRecords.next(deleteRecordKey, deleteRecordValue);
}
}
}
@Override
public void close() throws IOException {
if (this.deleteRecords != null) {
this.deleteRecords.close();
}
}
}
/**
* An implementation of DeleteEventRegistry that optimizes for performance by loading
* all the delete events into memory at once from all the delete delta files.
* It starts by reading all the delete events through a regular sort-merge into two
* vectors: one for the original transaction ids (otids), and the other for the row ids.
* (In the current version, since the bucket id should be the same for all the delete deltas,
* it is not stored.) The otids are likely to be repeated very often, as a single transaction
* often deletes thousands of rows. Hence, the otid vector is compressed to only store the
* fromIndex and toIndex ranges into the larger row id vector. Querying whether a record id
* is deleted is then done by performing a binary search on the compressed otid ranges;
* if a match is found, a second binary search is performed on the larger rowId vector
* between the matching fromIndex and toIndex. Of course, there is a rough heuristic that
* prevents creation of an instance of this class if the memory pressure is high.
* The SortMergedDeleteEventRegistry is the fallback for such scenarios.
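*
* An illustrative example (values are hypothetical): suppose the sort-merged delete events
* are (otid=5, rowId=1), (5, 7), (5, 9), (8, 2), (8, 3). Then rowIds = [1, 7, 9, 2, 3] and
* compressedOtids = [(otid=5, fromIndex=0, toIndex=3), (otid=8, fromIndex=3, toIndex=5)].
* A query for (8, 3) first binary-searches the compressed otids to find the range [3, 5),
* then binary-searches rowIds within that range and finds 3, so the record is deleted.
* A query for (5, 4) matches otid 5, but rowId 4 is not found in rowIds[0..3), so the
* record is not deleted.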
*/
static class ColumnizedDeleteEventRegistry implements DeleteEventRegistry {
/**
* A simple wrapper class to hold the (otid, rowId) pair.
*/
static class DeleteRecordKey implements Comparable<DeleteRecordKey> {
private long originalTransactionId;
private long rowId;
public DeleteRecordKey() {
this.originalTransactionId = -1;
this.rowId = -1;
}
public DeleteRecordKey(long otid, long rowId) {
this.originalTransactionId = otid;
this.rowId = rowId;
}
public void set(long otid, long rowId) {
this.originalTransactionId = otid;
this.rowId = rowId;
}
@Override
public int compareTo(DeleteRecordKey other) {
if (other == null) {
return -1;
}
if (originalTransactionId != other.originalTransactionId) {
return originalTransactionId < other.originalTransactionId ? -1 : 1;
}
if (rowId != other.rowId) {
return rowId < other.rowId ? -1 : 1;
}
return 0;
}
}
/**
* This class actually reads the delete delta files in vectorized row batches.
* For every call to next(), it returns the next smallest record id in the file if available.
* Internally, the next() buffers a row batch and maintains an index pointer, reading the
* next batch when the previous batch is exhausted.
*/
static class DeleteReaderValue {
private VectorizedRowBatch batch;
private final RecordReader recordReader;
private int indexPtrInBatch;
private final int bucketForSplit; // The bucket value should be same for all the records.
private final ValidTxnList validTxnList;
public DeleteReaderValue(Reader deleteDeltaReader, Reader.Options readerOptions, int bucket,
ValidTxnList validTxnList) throws IOException {
this.recordReader = deleteDeltaReader.rowsOptions(readerOptions);
this.bucketForSplit = bucket;
this.batch = deleteDeltaReader.getSchema().createRowBatch();
if (!recordReader.nextBatch(batch)) { // Read the first batch.
this.batch = null; // The first batch itself was empty; mark this reader as exhausted (the caller closes it).
}
this.indexPtrInBatch = 0;
this.validTxnList = validTxnList;
}
public boolean next(DeleteRecordKey deleteRecordKey) throws IOException {
if (batch == null) {
return false;
}
boolean isValidNext = false;
while (!isValidNext) {
if (indexPtrInBatch >= batch.size) {
// We have exhausted our current batch, read the next batch.
if (recordReader.nextBatch(batch)) {
// Whenever we read a batch, we must ensure that all the records in the batch
// have the same bucket id as the bucket id of the split. If not, we throw an exception.
// NOTE: this assertion might not hold once virtual bucketing is in place. However,
// it should be simple to fix that case: just replace the check for bucket equality with
// a check for a valid bucket mapping. Until virtual bucketing is added, a mismatch means
// either the split computation got messed up or we found some corrupted records.
long bucketForRecord = ((LongColumnVector) batch.cols[OrcRecordUpdater.BUCKET]).vector[0];
if ((batch.size > 1 && !batch.cols[OrcRecordUpdater.BUCKET].isRepeating)
|| (bucketForRecord != bucketForSplit)){
throw new IOException("Corrupted records with different bucket ids "
+ "from the containing bucket file found! Expected bucket id "
+ bucketForSplit + ", however found the bucket id " + bucketForRecord);
}
indexPtrInBatch = 0; // After reading the batch, reset the pointer to beginning.
} else {
return false; // no more batches to read, exhausted the reader.
}
}
int originalTransactionIndex =
batch.cols[OrcRecordUpdater.ORIGINAL_TRANSACTION].isRepeating ? 0 : indexPtrInBatch;
long originalTransaction =
((LongColumnVector) batch.cols[OrcRecordUpdater.ORIGINAL_TRANSACTION]).vector[originalTransactionIndex];
long rowId = ((LongColumnVector) batch.cols[OrcRecordUpdater.ROW_ID]).vector[indexPtrInBatch];
int currentTransactionIndex =
batch.cols[OrcRecordUpdater.CURRENT_TRANSACTION].isRepeating ? 0 : indexPtrInBatch;
long currentTransaction =
((LongColumnVector) batch.cols[OrcRecordUpdater.CURRENT_TRANSACTION]).vector[currentTransactionIndex];
++indexPtrInBatch;
if (validTxnList.isTxnValid(currentTransaction)) {
isValidNext = true;
deleteRecordKey.set(originalTransaction, rowId);
}
}
return true;
}
public void close() throws IOException {
this.recordReader.close();
}
}
/**
* A CompressedOtid class stores a compressed representation of the original
* transaction ids (otids) read from the delete delta files. Since the record ids
* are sorted by (otid, rowId) and otids are highly likely to be repetitive, it is
* efficient to compress them as a CompressedOtid that stores the fromIndex and
* the toIndex. The fromIndex and toIndex reference positions in the larger rowId vector
* formed by concatenating the correspondingly ordered rowIds.
*/
private class CompressedOtid implements Comparable<CompressedOtid> {
long originalTransactionId;
int fromIndex; // inclusive
int toIndex; // exclusive
public CompressedOtid(long otid, int fromIndex, int toIndex) {
this.originalTransactionId = otid;
this.fromIndex = fromIndex;
this.toIndex = toIndex;
}
@Override
public int compareTo(CompressedOtid other) {
// CompressedOtids are ordered by their otid only; fromIndex/toIndex do not participate.
if (originalTransactionId != other.originalTransactionId) {
return originalTransactionId < other.originalTransactionId ? -1 : 1;
}
return 0;
}
}
private TreeMap<DeleteRecordKey, DeleteReaderValue> sortMerger;
private long[] rowIds;
private CompressedOtid[] compressedOtids;
private ValidTxnList validTxnList;
public ColumnizedDeleteEventRegistry(JobConf conf, OrcSplit orcSplit,
Reader.Options readerOptions) throws IOException, DeleteEventsOverflowMemoryException {
int bucket = AcidUtils.parseBaseOrDeltaBucketFilename(orcSplit.getPath(), conf).getBucket();
String txnString = conf.get(ValidTxnList.VALID_TXNS_KEY);
this.validTxnList = (txnString == null) ? new ValidReadTxnList() : new ValidReadTxnList(txnString);
this.sortMerger = new TreeMap<DeleteRecordKey, DeleteReaderValue>();
this.rowIds = null;
this.compressedOtids = null;
int maxEventsInMemory = HiveConf.getIntVar(conf, ConfVars.HIVE_TRANSACTIONAL_NUM_EVENTS_IN_MEMORY);
try {
final Path[] deleteDeltaDirs = getDeleteDeltaDirsFromSplit(orcSplit);
if (deleteDeltaDirs.length > 0) {
int totalDeleteEventCount = 0;
for (Path deleteDeltaDir : deleteDeltaDirs) {
Path deleteDeltaFile = AcidUtils.createBucketFile(deleteDeltaDir, bucket);
FileSystem fs = deleteDeltaFile.getFileSystem(conf);
// NOTE: Computing the last flush length below is mostly future-proofing for when we have
// streaming deletes. Currently we don't support streaming deletes, and this can
// be removed if it becomes a performance issue.
long length = OrcAcidUtils.getLastFlushLength(fs, deleteDeltaFile);
// NOTE: A check for the existence of deleteDeltaFile is required because the bucket being
// processed for this split may not have any deletes.
if (length != -1 && fs.exists(deleteDeltaFile)) {
Reader deleteDeltaReader = OrcFile.createReader(deleteDeltaFile,
OrcFile.readerOptions(conf).maxLength(length));
AcidStats acidStats = OrcAcidUtils.parseAcidStats(deleteDeltaReader);
if (acidStats.deletes == 0) {
continue; // just a safe check to ensure that we are not reading empty delete files.
}
totalDeleteEventCount += acidStats.deletes;
if (totalDeleteEventCount > maxEventsInMemory) {
// ColumnizedDeleteEventRegistry loads all the delete events from all the delete deltas
// into memory. To prevent out-of-memory errors, this check is a rough heuristic that
// prevents creation of an object of this class if the total number of delete events
// exceeds this value. By default, it has been set to 10 million delete events per bucket.
LOG.info("Total number of delete events exceeds the maximum number of delete events "
+ "that can be loaded into memory for the delete deltas in the directories at "
+ Arrays.toString(deleteDeltaDirs) + ". The max limit is currently set at "
+ maxEventsInMemory + " and can be changed by setting the Hive config variable "
+ ConfVars.HIVE_TRANSACTIONAL_NUM_EVENTS_IN_MEMORY.varname);
throw new DeleteEventsOverflowMemoryException();
}
DeleteReaderValue deleteReaderValue = new DeleteReaderValue(deleteDeltaReader,
readerOptions, bucket, validTxnList);
DeleteRecordKey deleteRecordKey = new DeleteRecordKey();
if (deleteReaderValue.next(deleteRecordKey)) {
sortMerger.put(deleteRecordKey, deleteReaderValue);
} else {
deleteReaderValue.close();
}
}
}
if (totalDeleteEventCount > 0) {
// Initialize the rowId array when we have some delete events.
rowIds = new long[totalDeleteEventCount];
readAllDeleteEventsFromDeleteDeltas();
}
}
} catch(IOException|DeleteEventsOverflowMemoryException e) {
close(); // close any open readers, if there was some exception during initialization.
throw e; // rethrow the exception so that the caller can handle.
}
}
private void readAllDeleteEventsFromDeleteDeltas() throws IOException {
if (sortMerger == null || sortMerger.isEmpty()) return; // trivial case, nothing to read.
int distinctOtids = 0;
long lastSeenOtid = -1;
long[] otids = new long[rowIds.length];
int index = 0;
while (!sortMerger.isEmpty()) {
// The sortMerger is a TreeMap used as a priority queue: it stores one
// (deleteRecordKey, deleteReaderValue) pair per delete delta reader and is ordered by
// deleteRecordKey. The deleteReaderValue is the wrapper class that holds the reference to
// the underlying delta file being read, and its corresponding deleteRecordKey is the
// smallest record id still unread from that file. In each iteration of this loop, we
// extract (poll) the minimum deleteRecordKey pair. Once we have processed that
// deleteRecordKey, we advance the pointer of the corresponding deleteReaderValue. If the
// underlying file has no more records, we drop that pair from the map; otherwise we
// add the updated pair back to the map.
Entry<DeleteRecordKey, DeleteReaderValue> entry = sortMerger.pollFirstEntry();
DeleteRecordKey deleteRecordKey = entry.getKey();
DeleteReaderValue deleteReaderValue = entry.getValue();
otids[index] = deleteRecordKey.originalTransactionId;
rowIds[index] = deleteRecordKey.rowId;
++index;
if (lastSeenOtid != deleteRecordKey.originalTransactionId) {
++distinctOtids;
lastSeenOtid = deleteRecordKey.originalTransactionId;
}
if (deleteReaderValue.next(deleteRecordKey)) {
sortMerger.put(deleteRecordKey, deleteReaderValue);
} else {
deleteReaderValue.close(); // Exhausted reading all records, close the reader.
}
}
// Once we have processed all the delete events and seen all the distinct otids,
// we compress the otids into the CompressedOtid data structure, which records
// the fromIndex (inclusive) and toIndex (exclusive) for each unique otid.
this.compressedOtids = new CompressedOtid[distinctOtids];
lastSeenOtid = otids[0];
int fromIndex = 0, pos = 0;
for (int i = 1; i < otids.length; ++i) {
if (otids[i] != lastSeenOtid) {
compressedOtids[pos] = new CompressedOtid(lastSeenOtid, fromIndex, i);
lastSeenOtid = otids[i];
fromIndex = i;
++pos;
}
}
// account for the last distinct otid
compressedOtids[pos] = new CompressedOtid(lastSeenOtid, fromIndex, otids.length);
}
private boolean isDeleted(long otid, long rowId) {
if (compressedOtids == null || rowIds == null) {
return false;
}
// To find if a given (otid, rowId) pair is deleted or not, we perform
// two binary searches at most. The first binary search is on the
// compressed otids. If a match is found, only then we do the next
// binary search in the larger rowId vector between the given toIndex & fromIndex.
// Check if otid is outside the range of all otids present.
if (otid < compressedOtids[0].originalTransactionId
|| otid > compressedOtids[compressedOtids.length - 1].originalTransactionId) {
return false;
}
// Create a dummy key for searching the otid in the compressed otid ranges.
CompressedOtid key = new CompressedOtid(otid, -1, -1);
int pos = Arrays.binarySearch(compressedOtids, key);
if (pos >= 0) {
// Otid with the given value found! Searching now for rowId...
key = compressedOtids[pos]; // Retrieve the actual CompressedOtid that matched.
// Check if rowId is outside the range of all rowIds present for this otid.
if (rowId < rowIds[key.fromIndex]
|| rowId > rowIds[key.toIndex - 1]) {
return false;
}
if (Arrays.binarySearch(rowIds, key.fromIndex, key.toIndex, rowId) >= 0) {
return true; // rowId also found!
}
}
return false;
}
@Override
public void findDeletedRecords(VectorizedRowBatch batch, BitSet selectedBitSet)
throws IOException {
if (rowIds == null || compressedOtids == null) {
return;
}
// Iterate through the batch and for each (otid, rowid) in the batch
// check if it is deleted or not.
long[] originalTransactionVector =
batch.cols[OrcRecordUpdater.ORIGINAL_TRANSACTION].isRepeating ? null
: ((LongColumnVector) batch.cols[OrcRecordUpdater.ORIGINAL_TRANSACTION]).vector;
long repeatedOriginalTransaction = (originalTransactionVector != null) ? -1
: ((LongColumnVector) batch.cols[OrcRecordUpdater.ORIGINAL_TRANSACTION]).vector[0];
long[] rowIdVector =
((LongColumnVector) batch.cols[OrcRecordUpdater.ROW_ID]).vector;
for (int setBitIndex = selectedBitSet.nextSetBit(0);
setBitIndex >= 0;
setBitIndex = selectedBitSet.nextSetBit(setBitIndex+1)) {
long otid = originalTransactionVector != null ? originalTransactionVector[setBitIndex]
: repeatedOriginalTransaction ;
long rowId = rowIdVector[setBitIndex];
if (isDeleted(otid, rowId)) {
selectedBitSet.clear(setBitIndex);
}
}
}
@Override
public void close() throws IOException {
// ColumnizedDeleteEventRegistry reads all the delete events into memory during initialization
// and closes the delete event readers once it is done. If an exception is thrown during
// initialization, we may have to close any readers that are still left open.
while (!sortMerger.isEmpty()) {
Entry<DeleteRecordKey, DeleteReaderValue> entry = sortMerger.pollFirstEntry();
entry.getValue().close(); // close the reader for this entry
}
}
}
static class DeleteEventsOverflowMemoryException extends Exception {
private static final long serialVersionUID = 1L;
}
}