/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.io.orc;

import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

import org.apache.orc.OrcUtils;
import org.apache.orc.StripeInformation;
import org.apache.orc.TypeDescription;
import org.apache.orc.impl.AcidStats;
import org.apache.orc.impl.OrcAcidUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.ValidTxnList;
import org.apache.hadoop.hive.ql.io.AcidInputFormat;
import org.apache.hadoop.hive.ql.io.AcidUtils;
import org.apache.hadoop.hive.ql.io.RecordIdentifier;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;

import com.google.common.annotations.VisibleForTesting;

/**
 * Merges a base and a list of delta files together into a single stream of
 * events.
 */
public class OrcRawRecordMerger implements AcidInputFormat.RawReader<OrcStruct> {

  private static final Logger LOG = LoggerFactory.getLogger(OrcRawRecordMerger.class);

  private final Configuration conf;
  private final boolean collapse;
  private final RecordReader baseReader;
  private final ObjectInspector objectInspector;
  private final long offset;
  private final long length;
  private final ValidTxnList validTxnList;
  private final int columns;
  private ReaderKey prevKey = new ReaderKey();
  // this is the key less than the lowest key we need to process
  private RecordIdentifier minKey;
  // this is the last key we need to process
  private RecordIdentifier maxKey;
  // an extra value so that we can return it while reading ahead
  private OrcStruct extraValue;

  /**
   * A RecordIdentifier extended with the current transaction id. This is the
   * key of our merge sort with the originalTransaction, bucket, and rowId
   * ascending and the currentTransaction, statementId descending. This means
   * that if the reader is collapsing events to just the last update, just the
   * first instance of each record is required.
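   * For example (illustrative values), if transactions 8 and 9 both updated
   * the row (originalTransaction=5, bucket=0, rowId=7), the key with
   * currentTransaction=9 sorts first, so a collapsing reader returns only the
   * newest version of that row.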
   */
  @VisibleForTesting
  public final static class ReaderKey extends RecordIdentifier {
    private long currentTransactionId;
    private int statementId; // sort on this descending, like currentTransactionId

    public ReaderKey() {
      this(-1, -1, -1, -1, 0);
    }

    public ReaderKey(long originalTransaction, int bucket, long rowId,
                     long currentTransactionId) {
      this(originalTransaction, bucket, rowId, currentTransactionId, 0);
    }

    /**
     * @param statementId - set this to 0 if N/A
     */
    public ReaderKey(long originalTransaction, int bucket, long rowId,
                     long currentTransactionId, int statementId) {
      super(originalTransaction, bucket, rowId);
      this.currentTransactionId = currentTransactionId;
      this.statementId = statementId;
    }

    @Override
    public void set(RecordIdentifier other) {
      super.set(other);
      currentTransactionId = ((ReaderKey) other).currentTransactionId;
      statementId = ((ReaderKey) other).statementId;
    }

    public void setValues(long originalTransactionId, int bucket, long rowId,
                          long currentTransactionId, int statementId) {
      setValues(originalTransactionId, bucket, rowId);
      this.currentTransactionId = currentTransactionId;
      this.statementId = statementId;
    }

    @Override
    public boolean equals(Object other) {
      return super.equals(other) &&
          currentTransactionId == ((ReaderKey) other).currentTransactionId &&
          statementId == ((ReaderKey) other).statementId; // consistent with compareTo()
    }

    @Override
    public int hashCode() {
      int result = super.hashCode();
      result = 31 * result + (int) (currentTransactionId ^ (currentTransactionId >>> 32));
      result = 31 * result + statementId;
      return result;
    }

    @Override
    public int compareTo(RecordIdentifier other) {
      int sup = compareToInternal(other);
      if (sup == 0) {
        if (other.getClass() == ReaderKey.class) {
          ReaderKey oth = (ReaderKey) other;
          if (currentTransactionId != oth.currentTransactionId) {
            return currentTransactionId < oth.currentTransactionId ? +1 : -1;
          }
          if (statementId != oth.statementId) {
            return statementId < oth.statementId ? +1 : -1;
          }
        } else {
          return -1;
        }
      }
      return sup;
    }

    /**
     * This means 1 txn modified the same row more than once
     */
    private boolean isSameRow(ReaderKey other) {
      return compareRow(other) == 0 && currentTransactionId == other.currentTransactionId;
    }

    public long getCurrentTransactionId() {
      return currentTransactionId;
    }

    /**
     * Compare rows without considering the currentTransactionId.
     * @param other the value to compare to
     * @return -1, 0, +1
     */
    public int compareRow(RecordIdentifier other) {
      return compareToInternal(other);
    }

    @Override
    public String toString() {
      return "{originalTxn: " + getTransactionId() + ", bucket: " +
          getBucketId() + ", row: " + getRowId() + ", currentTxn: " +
          currentTransactionId + ", statementId: " + statementId + "}";
    }
  }

  /**
   * A reader and the next record from that reader. The code reads ahead so that
   * we can return the lowest ReaderKey from each of the readers. Thus, the
   * next available row is nextRecord and only following records are still in
   * the reader.
   */
  static class ReaderPair {
    OrcStruct nextRecord;
    final Reader reader;
    final RecordReader recordReader;
    final ReaderKey key;
    final RecordIdentifier maxKey;
    final int bucket;
    private final int statementId;

    /**
     * Create a reader that reads from the first key larger than minKey to any
     * keys equal to maxKey.
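     * When the constructor returns, nextRecord already holds the first row in
     * range (or null if no row qualifies), because it reads ahead past minKey.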
     * @param key the key to read into
     * @param reader the ORC file reader
     * @param bucket the bucket number for the file
     * @param minKey only return keys larger than minKey if it is non-null
     * @param maxKey only return keys less than or equal to maxKey if it is
     *               non-null
     * @param options options to provide to read the rows.
     * @param statementId id of SQL statement within a transaction
     * @throws IOException
     */
    ReaderPair(ReaderKey key, Reader reader, int bucket,
               RecordIdentifier minKey, RecordIdentifier maxKey,
               ReaderImpl.Options options, int statementId) throws IOException {
      this.reader = reader;
      this.key = key;
      this.maxKey = maxKey;
      this.bucket = bucket;
      // TODO use stripe statistics to jump over stripes
      recordReader = reader.rowsOptions(options);
      this.statementId = statementId;
      // advance the reader until we reach the minimum key
      do {
        next(nextRecord);
      } while (nextRecord != null &&
          (minKey != null && key.compareRow(minKey) <= 0));
    }

    void next(OrcStruct next) throws IOException {
      if (recordReader.hasNext()) {
        nextRecord = (OrcStruct) recordReader.next(next);
        // set the key
        key.setValues(OrcRecordUpdater.getOriginalTransaction(nextRecord),
            OrcRecordUpdater.getBucket(nextRecord),
            OrcRecordUpdater.getRowId(nextRecord),
            OrcRecordUpdater.getCurrentTransaction(nextRecord),
            statementId);

        // if this record is larger than maxKey, we need to stop
        if (maxKey != null && key.compareRow(maxKey) > 0) {
          LOG.debug("key " + key + " > maxkey " + maxKey);
          nextRecord = null;
          recordReader.close();
        }
      } else {
        nextRecord = null;
        recordReader.close();
      }
    }

    int getColumns() {
      return reader.getTypes().get(OrcRecordUpdater.ROW + 1).getSubtypesCount();
    }
  }

  /**
   * A reader that pretends an original base file is a new version base file.
   * It wraps the underlying reader's row with an ACID event object and
   * makes the relevant translations.
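   * Each row is surfaced as an INSERT event with originalTransaction and
   * currentTransaction set to 0, the reader's bucket, and the row number in
   * the file used as the rowId.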
   */
  static final class OriginalReaderPair extends ReaderPair {
    OriginalReaderPair(ReaderKey key, Reader reader, int bucket,
                       RecordIdentifier minKey, RecordIdentifier maxKey,
                       Reader.Options options) throws IOException {
      super(key, reader, bucket, minKey, maxKey, options, 0);
    }

    @Override
    void next(OrcStruct next) throws IOException {
      if (recordReader.hasNext()) {
        long nextRowId = recordReader.getRowNumber();
        // have to do initialization here, because the super's constructor
        // calls next and thus we need to initialize before our constructor
        // runs
        if (next == null) {
          nextRecord = new OrcStruct(OrcRecordUpdater.FIELDS);
          IntWritable operation =
              new IntWritable(OrcRecordUpdater.INSERT_OPERATION);
          nextRecord.setFieldValue(OrcRecordUpdater.OPERATION, operation);
          nextRecord.setFieldValue(OrcRecordUpdater.CURRENT_TRANSACTION,
              new LongWritable(0));
          nextRecord.setFieldValue(OrcRecordUpdater.ORIGINAL_TRANSACTION,
              new LongWritable(0));
          nextRecord.setFieldValue(OrcRecordUpdater.BUCKET,
              new IntWritable(bucket));
          nextRecord.setFieldValue(OrcRecordUpdater.ROW_ID,
              new LongWritable(nextRowId));
          nextRecord.setFieldValue(OrcRecordUpdater.ROW,
              recordReader.next(null));
        } else {
          nextRecord = next;
          ((IntWritable) next.getFieldValue(OrcRecordUpdater.OPERATION))
              .set(OrcRecordUpdater.INSERT_OPERATION);
          ((LongWritable) next.getFieldValue(OrcRecordUpdater.ORIGINAL_TRANSACTION))
              .set(0);
          ((IntWritable) next.getFieldValue(OrcRecordUpdater.BUCKET))
              .set(bucket);
          ((LongWritable) next.getFieldValue(OrcRecordUpdater.CURRENT_TRANSACTION))
              .set(0);
          ((LongWritable) next.getFieldValue(OrcRecordUpdater.ROW_ID))
              .set(nextRowId);
          nextRecord.setFieldValue(OrcRecordUpdater.ROW,
              recordReader.next(OrcRecordUpdater.getRow(next)));
        }
        key.setValues(0L, bucket, nextRowId, 0L, 0);
        if (maxKey != null && key.compareRow(maxKey) > 0) {
          if (LOG.isDebugEnabled()) {
            LOG.debug("key " + key + " > maxkey " + maxKey);
          }
          nextRecord = null;
          recordReader.close();
        }
      } else {
        nextRecord = null;
        recordReader.close();
      }
    }

    @Override
    int getColumns() {
      return reader.getTypes().get(0).getSubtypesCount();
    }
  }

  private final TreeMap<ReaderKey, ReaderPair> readers =
      new TreeMap<ReaderKey, ReaderPair>();

  // The reader that currently has the lowest key.
  private ReaderPair primary;

  // The key of the next lowest reader.
  private ReaderKey secondaryKey = null;

  /**
   * Find the key range for original bucket files.
   * @param reader the reader
   * @param bucket the bucket number we are reading
   * @param options the options for reading with
   * @throws IOException
   */
  private void discoverOriginalKeyBounds(Reader reader, int bucket,
                                         Reader.Options options) throws IOException {
    long rowLength = 0;
    long rowOffset = 0;
    long offset = options.getOffset();
    long maxOffset = options.getMaxOffset();
    boolean isTail = true;
    for (StripeInformation stripe : reader.getStripes()) {
      if (offset > stripe.getOffset()) {
        rowOffset += stripe.getNumberOfRows();
      } else if (maxOffset > stripe.getOffset()) {
        rowLength += stripe.getNumberOfRows();
      } else {
        isTail = false;
        break;
      }
    }
    if (rowOffset > 0) {
      minKey = new RecordIdentifier(0, bucket, rowOffset - 1);
    }
    if (!isTail) {
      maxKey = new RecordIdentifier(0, bucket, rowOffset + rowLength - 1);
    }
  }

  /**
   * Find the key range for bucket files.
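   * The bounds come from the key index that OrcRecordUpdater stores in the
   * file: minKey is the index entry for the stripe just before this split and
   * maxKey is the entry for the last stripe that the split covers.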
   * @param reader the reader
   * @param options the options for reading with
   * @throws IOException
   */
  private void discoverKeyBounds(Reader reader,
                                 Reader.Options options) throws IOException {
    RecordIdentifier[] keyIndex = OrcRecordUpdater.parseKeyIndex(reader);
    long offset = options.getOffset();
    long maxOffset = options.getMaxOffset();
    int firstStripe = 0;
    int stripeCount = 0;
    boolean isTail = true;
    List<StripeInformation> stripes = reader.getStripes();
    for (StripeInformation stripe : stripes) {
      if (offset > stripe.getOffset()) {
        firstStripe += 1;
      } else if (maxOffset > stripe.getOffset()) {
        stripeCount += 1;
      } else {
        isTail = false;
        break;
      }
    }
    if (firstStripe != 0) {
      minKey = keyIndex[firstStripe - 1];
    }
    if (!isTail) {
      maxKey = keyIndex[firstStripe + stripeCount - 1];
    }
  }

  /**
   * Convert from the row include/sarg/columnNames to the event equivalent
   * for the underlying file.
   * @param options options for the row reader
   * @return a cloned options object that is modified for the event reader
   */
  static Reader.Options createEventOptions(Reader.Options options) {
    Reader.Options result = options.clone();
    result.range(options.getOffset(), Long.MAX_VALUE);
    result.include(options.getInclude());

    // slide the column names down by 6 for the name array
    if (options.getColumnNames() != null) {
      String[] orig = options.getColumnNames();
      String[] cols = new String[orig.length + OrcRecordUpdater.FIELDS];
      for (int i = 0; i < orig.length; ++i) {
        cols[i + OrcRecordUpdater.FIELDS] = orig[i];
      }
      result.searchArgument(options.getSearchArgument(), cols);
    }
    return result;
  }

  /**
   * Create a reader that merge sorts the ACID events together.
   * @param conf the configuration
   * @param collapseEvents should the events on the same row be collapsed
   * @param reader the reader for the base file, or null if there is no base
   * @param isOriginal is the base file a pre-acid file
   * @param bucket the bucket we are reading
   * @param validTxnList the list of valid transactions to include
   * @param options the options to read with
   * @param deltaDirectory the list of delta directories to include
   * @throws IOException
   */
  OrcRawRecordMerger(Configuration conf,
                     boolean collapseEvents,
                     Reader reader,
                     boolean isOriginal,
                     int bucket,
                     ValidTxnList validTxnList,
                     Reader.Options options,
                     Path[] deltaDirectory) throws IOException {
    this.conf = conf;
    this.collapse = collapseEvents;
    this.offset = options.getOffset();
    this.length = options.getLength();
    this.validTxnList = validTxnList;
    TypeDescription typeDescr =
        OrcInputFormat.getDesiredRowTypeDescr(conf, true, Integer.MAX_VALUE);

    objectInspector = OrcRecordUpdater.createEventSchema(
        OrcStruct.createObjectInspector(0, OrcUtils.getOrcTypes(typeDescr)));

    // modify the options to reflect the event instead of the base row
    Reader.Options eventOptions = createEventOptions(options);
    if (reader == null) {
      baseReader = null;
    } else {
      // find the min/max based on the offset and length
      if (isOriginal) {
        discoverOriginalKeyBounds(reader, bucket, options);
      } else {
        discoverKeyBounds(reader, options);
      }
      LOG.info("min key = " + minKey + ", max key = " + maxKey);
      // use the min/max instead of the byte range
      ReaderPair pair;
      ReaderKey key = new ReaderKey();
      if (isOriginal) {
        options = options.clone();
        pair = new OriginalReaderPair(key, reader, bucket, minKey, maxKey,
                                      options);
      } else {
        pair = new ReaderPair(key, reader, bucket, minKey, maxKey,
                              eventOptions, 0);
      }

      // if there is at least one record, put it in the map
      if (pair.nextRecord != null) {
        readers.put(key, pair);
      }
      baseReader = pair.recordReader;
    }

    // we always want to read all of the deltas
    eventOptions.range(0, Long.MAX_VALUE);
    if (deltaDirectory != null) {
      for (Path delta : deltaDirectory) {
        ReaderKey key =
            new ReaderKey();
        Path deltaFile = AcidUtils.createBucketFile(delta, bucket);
        AcidUtils.ParsedDelta deltaDir = AcidUtils.parsedDelta(delta);
        FileSystem fs = deltaFile.getFileSystem(conf);
        long length = OrcAcidUtils.getLastFlushLength(fs, deltaFile);
        if (length != -1 && fs.exists(deltaFile)) {
          Reader deltaReader = OrcFile.createReader(deltaFile,
              OrcFile.readerOptions(conf).maxLength(length));
          Reader.Options deltaEventOptions = null;
          if (eventOptions.getSearchArgument() != null) {
            // Turn off the sarg before pushing it to delta.  We never want to
            // push a sarg to a delta as it can produce wrong results (if the
            // latest valid version of the record is filtered out by the sarg)
            // or ArrayOutOfBounds errors (when the sarg is applied to a delete
            // record), unless the delta only has insert events.
            AcidStats acidStats = OrcAcidUtils.parseAcidStats(deltaReader);
            if (acidStats.deletes > 0 || acidStats.updates > 0) {
              deltaEventOptions = eventOptions.clone().searchArgument(null, null);
            }
          }
          ReaderPair deltaPair;
          deltaPair = new ReaderPair(key, deltaReader, bucket, minKey, maxKey,
              deltaEventOptions != null ? deltaEventOptions : eventOptions,
              deltaDir.getStatementId());
          if (deltaPair.nextRecord != null) {
            readers.put(key, deltaPair);
          }
        }
      }
    }

    // get the first record
    Map.Entry<ReaderKey, ReaderPair> entry = readers.pollFirstEntry();
    if (entry == null) {
      columns = 0;
      primary = null;
    } else {
      primary = entry.getValue();
      if (readers.isEmpty()) {
        secondaryKey = null;
      } else {
        secondaryKey = readers.firstKey();
      }
      // get the number of columns in the user's rows
      columns = primary.getColumns();
    }
  }

  @VisibleForTesting
  RecordIdentifier getMinKey() {
    return minKey;
  }

  @VisibleForTesting
  RecordIdentifier getMaxKey() {
    return maxKey;
  }

  @VisibleForTesting
  ReaderPair getCurrentReader() {
    return primary;
  }

  @VisibleForTesting
  Map<ReaderKey, ReaderPair> getOtherReaders() {
    return readers;
  }

  @Override
  public boolean next(RecordIdentifier recordIdentifier,
                      OrcStruct prev) throws IOException {
    boolean keysSame = true;
    while (keysSame && primary != null) {

      // The primary's nextRecord is the next value to return
      OrcStruct current = primary.nextRecord;
      recordIdentifier.set(primary.key);

      // Advance the primary reader to the next record
      primary.next(extraValue);

      // Save the current record as the new extraValue for next time so that
      // we minimize allocations
      extraValue = current;

      // now that the primary reader has advanced, we need to see if we
      // continue to read it or move to the secondary.
      if (primary.nextRecord == null ||
          primary.key.compareTo(secondaryKey) > 0) {

        // if the primary isn't done, push it back into the readers
        if (primary.nextRecord != null) {
          readers.put(primary.key, primary);
        }

        // update primary and secondaryKey
        Map.Entry<ReaderKey, ReaderPair> entry = readers.pollFirstEntry();
        if (entry != null) {
          primary = entry.getValue();
          if (readers.isEmpty()) {
            secondaryKey = null;
          } else {
            secondaryKey = readers.firstKey();
          }
        } else {
          primary = null;
        }
      }

      // if this transaction isn't ok, skip over it
      if (!validTxnList.isTxnValid(
          ((ReaderKey) recordIdentifier).getCurrentTransactionId())) {
        continue;
      }

      /* For multi-statement txns, you may have multiple events for the same
       * row in the same (current) transaction.  We want to collapse these to
       * just the last one regardless of whether we are minor compacting.
       * Consider INSERT/UPDATE/UPDATE of the same row in the same txn.  There
       * is no benefit passing along anything except the last
       * event.
       * If we did want to pass it along, we'd have to include statementId in
       * the row returned so that compaction could write it out or make minor
       * compaction understand how to write out delta files in
       * delta_xxx_yyy_stid format.  There doesn't seem to be any value in
       * this. */
      boolean isSameRow = prevKey.isSameRow((ReaderKey) recordIdentifier);
      // if we are collapsing, figure out if this is a new row
      if (collapse || isSameRow) {
        keysSame = (collapse && prevKey.compareRow(recordIdentifier) == 0) || isSameRow;
        if (!keysSame) {
          prevKey.set(recordIdentifier);
        }
      } else {
        keysSame = false;
      }

      // set the output record by fiddling with the pointers so that we can
      // avoid a copy.
      prev.linkFields(current);
    }

    return !keysSame;
  }

  @Override
  public RecordIdentifier createKey() {
    return new ReaderKey();
  }

  @Override
  public OrcStruct createValue() {
    return new OrcStruct(OrcRecordUpdater.FIELDS);
  }

  @Override
  public long getPos() throws IOException {
    return offset + (long) (getProgress() * length);
  }

  @Override
  public void close() throws IOException {
    if (primary != null) {
      primary.recordReader.close();
    }
    for (ReaderPair pair : readers.values()) {
      pair.recordReader.close();
    }
  }

  @Override
  public float getProgress() throws IOException {
    return baseReader == null ? 1 : baseReader.getProgress();
  }

  @Override
  public ObjectInspector getObjectInspector() {
    return objectInspector;
  }

  @Override
  public boolean isDelete(OrcStruct value) {
    return OrcRecordUpdater.getOperation(value) == OrcRecordUpdater.DELETE_OPERATION;
  }

  /**
   * Get the number of columns in the underlying rows.
   * @return 0 if there are no base and no deltas.
   */
  public int getColumns() {
    return columns;
  }
}