/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.io.orc;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.AcidOutputFormat;
import org.apache.hadoop.hive.ql.io.AcidUtils;
import org.apache.hadoop.hive.ql.io.RecordIdentifier;
import org.apache.hadoop.hive.ql.io.RecordUpdater;
import org.apache.hadoop.hive.serde2.SerDeStats;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.orc.OrcConf;
import org.apache.orc.impl.AcidStats;
import org.apache.orc.impl.OrcAcidUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.annotations.VisibleForTesting;

/**
 * A RecordUpdater where the files are stored as ORC.
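 * <p>
 * Each record is wrapped in an ACID event struct of the form
 * (operation, originalTransaction, bucket, rowId, currentTransaction, row);
 * see the field-offset constants and {@link #createEventSchema} below for the
 * authoritative layout.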
 */
public class OrcRecordUpdater implements RecordUpdater {

  private static final Logger LOG = LoggerFactory.getLogger(OrcRecordUpdater.class);

  public static final String ACID_KEY_INDEX_NAME = "hive.acid.key.index";
  public static final String ACID_FORMAT = "_orc_acid_version";
  public static final int ORC_ACID_VERSION = 0;

  final static int INSERT_OPERATION = 0;
  final static int UPDATE_OPERATION = 1;
  final static int DELETE_OPERATION = 2;

  // Field offsets within the ACID event struct.
  final static int OPERATION = 0;
  final static int ORIGINAL_TRANSACTION = 1;
  final static int BUCKET = 2;
  final static int ROW_ID = 3;
  final static int CURRENT_TRANSACTION = 4;
  final static int ROW = 5;
  final static int FIELDS = 6;

  final static int DELTA_BUFFER_SIZE = 16 * 1024;
  final static long DELTA_STRIPE_SIZE = 16 * 1024 * 1024;

  private static final Charset UTF8 = Charset.forName("UTF-8");

  private final AcidOutputFormat.Options options;
  private final AcidUtils.AcidOperationalProperties acidOperationalProperties;
  private final Path path;
  private Path deleteEventPath;
  private final FileSystem fs;
  private OrcFile.WriterOptions writerOptions;
  private Writer writer = null;
  private boolean writerClosed = false;
  private Writer deleteEventWriter = null;
  private final FSDataOutputStream flushLengths;
  private final OrcStruct item;
  private final IntWritable operation = new IntWritable();
  private final LongWritable currentTransaction = new LongWritable(-1);
  private final LongWritable originalTransaction = new LongWritable(-1);
  private final IntWritable bucket = new IntWritable();
  private final LongWritable rowId = new LongWritable();
  private long insertedRows = 0;
  private long rowIdOffset = 0;
  // This records how many rows have been inserted or deleted.  It is separate from insertedRows
  // because that is monotonically increasing to give new unique row ids.
  private long rowCountDelta = 0;
  private final KeyIndexBuilder indexBuilder = new KeyIndexBuilder();
  private KeyIndexBuilder deleteEventIndexBuilder;
  private StructField recIdField = null; // field to look for the record identifier in
  private StructField rowIdField = null; // field inside recId to look for row id in
  private StructField originalTxnField = null; // field inside recId to look for original txn in
  private StructField bucketField = null; // field inside recId to look for bucket in
  private StructObjectInspector rowInspector; // OI for the original row
  private StructObjectInspector recIdInspector; // OI for the record identifier struct
  private LongObjectInspector rowIdInspector; // OI for the long row id inside the recordIdentifier
  private LongObjectInspector origTxnInspector; // OI for the original txn inside the record
                                                // identifier

  static int getOperation(OrcStruct struct) {
    return ((IntWritable) struct.getFieldValue(OPERATION)).get();
  }

  static long getCurrentTransaction(OrcStruct struct) {
    return ((LongWritable) struct.getFieldValue(CURRENT_TRANSACTION)).get();
  }

  static long getOriginalTransaction(OrcStruct struct) {
    return ((LongWritable) struct.getFieldValue(ORIGINAL_TRANSACTION)).get();
  }

  static int getBucket(OrcStruct struct) {
    return ((IntWritable) struct.getFieldValue(BUCKET)).get();
  }

  static long getRowId(OrcStruct struct) {
    return ((LongWritable) struct.getFieldValue(ROW_ID)).get();
  }

  static OrcStruct getRow(OrcStruct struct) {
    if (struct == null) {
      return null;
    } else {
      return (OrcStruct) struct.getFieldValue(ROW);
    }
  }

  /**
   * An extension to AcidOutputFormat that allows users to add additional
   * options.
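   * <p>
   * A minimal usage sketch (hypothetical, not taken from callers in the
   * source):
   * <pre>
   *   OrcFile.WriterOptions orcOpts = OrcFile.writerOptions(conf);
   *   OrcOptions acidOpts = new OrcOptions(conf).orcOptions(orcOpts);
   * </pre>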
   */
  public static class OrcOptions extends AcidOutputFormat.Options {
    OrcFile.WriterOptions orcOptions = null;

    public OrcOptions(Configuration conf) {
      super(conf);
    }

    public OrcOptions orcOptions(OrcFile.WriterOptions opts) {
      this.orcOptions = opts;
      return this;
    }

    public OrcFile.WriterOptions getOrcOptions() {
      return orcOptions;
    }
  }

  /**
   * Create an object inspector for the ACID event based on the object inspector
   * for the underlying row.
   * @param rowInspector the row's object inspector
   * @return an object inspector for the event stream
   */
  static StructObjectInspector createEventSchema(ObjectInspector rowInspector) {
    List<StructField> fields = new ArrayList<StructField>();
    fields.add(new OrcStruct.Field("operation",
        PrimitiveObjectInspectorFactory.writableIntObjectInspector, OPERATION));
    fields.add(new OrcStruct.Field("originalTransaction",
        PrimitiveObjectInspectorFactory.writableLongObjectInspector,
        ORIGINAL_TRANSACTION));
    fields.add(new OrcStruct.Field("bucket",
        PrimitiveObjectInspectorFactory.writableIntObjectInspector, BUCKET));
    fields.add(new OrcStruct.Field("rowId",
        PrimitiveObjectInspectorFactory.writableLongObjectInspector, ROW_ID));
    fields.add(new OrcStruct.Field("currentTransaction",
        PrimitiveObjectInspectorFactory.writableLongObjectInspector,
        CURRENT_TRANSACTION));
    fields.add(new OrcStruct.Field("row", rowInspector, ROW));
    return new OrcStruct.OrcStructInspector(fields);
  }
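  // Illustrative only: for a row inspector describing struct<a:int,b:string>,
  // the event schema built above corresponds to
  //   struct<operation:int, originalTransaction:bigint, bucket:int,
  //          rowId:bigint, currentTransaction:bigint, row:struct<a:int,b:string>>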
  OrcRecordUpdater(Path path,
                   AcidOutputFormat.Options options) throws IOException {
    this.options = options;
    // Initialize acidOperationalProperties based on table properties, and
    // if they are not available, see if we can find them in the job
    // configuration. We have to look in these two places instead of just the
    // conf, because Streaming Ingest uses table properties, while normal Hive
    // SQL inserts/updates/deletes place this value in the configuration
    // object.
    if (options.getTableProperties() != null) {
      this.acidOperationalProperties =
          AcidUtils.getAcidOperationalProperties(options.getTableProperties());
    } else {
      this.acidOperationalProperties =
          AcidUtils.getAcidOperationalProperties(options.getConfiguration());
    }
    this.bucket.set(options.getBucket());
    this.path = AcidUtils.createFilename(path, options);
    this.deleteEventWriter = null;
    this.deleteEventPath = null;
    FileSystem fs = options.getFilesystem();
    if (fs == null) {
      fs = path.getFileSystem(options.getConfiguration());
    }
    this.fs = fs;
    Path formatFile = new Path(path, ACID_FORMAT);
    if (!fs.exists(formatFile)) {
      try (FSDataOutputStream strm = fs.create(formatFile, false)) {
        strm.writeInt(ORC_ACID_VERSION);
      } catch (IOException ioe) {
        if (LOG.isDebugEnabled()) {
          LOG.debug("Failed to create " + path + "/" + ACID_FORMAT + " with " +
              ioe);
        }
      }
    }
    if (options.getMinimumTransactionId() != options.getMaximumTransactionId()
        && !options.isWritingBase()) {
      flushLengths = fs.create(OrcAcidUtils.getSideFile(this.path), true, 8,
          options.getReporter());
    } else {
      flushLengths = null;
    }
    this.writerOptions = null;
    // If writing to a delta directory, clone the original options so we don't
    // pollute them for the base writer.
    if (options.isWritingBase()) {
      if (options instanceof OrcOptions) {
        writerOptions = ((OrcOptions) options).getOrcOptions();
      }
      if (writerOptions == null) {
        writerOptions = OrcFile.writerOptions(options.getTableProperties(),
            options.getConfiguration());
      }
    } else {  // delta writer
      AcidOutputFormat.Options optionsCloneForDelta = options.clone();
      if (optionsCloneForDelta instanceof OrcOptions) {
        writerOptions = ((OrcOptions) optionsCloneForDelta).getOrcOptions();
      }
      if (writerOptions == null) {
        writerOptions = OrcFile.writerOptions(optionsCloneForDelta.getTableProperties(),
            optionsCloneForDelta.getConfiguration());
      }
      if (this.acidOperationalProperties.isSplitUpdate()) {
        // If this is a split-update, initialize a delete delta file path in
        // anticipation that update/delete events will be written to that
        // separate file. This writes to a file in a directory whose name
        // starts with "delete_delta_...". The actual initialization of a
        // writer only happens if any delete events are written.
        this.deleteEventPath = AcidUtils.createFilename(path,
            optionsCloneForDelta.writingDeleteDelta(true));
      }
      // Get the buffer size and stripe size configured for the base writer,
      // then scale them down for the delta writer based on BASE_DELTA_RATIO.
      int baseBufferSizeValue = writerOptions.getBufferSize();
      long baseStripeSizeValue = writerOptions.getStripeSize();
      int ratio = (int) OrcConf.BASE_DELTA_RATIO.getLong(options.getConfiguration());
      writerOptions.bufferSize(baseBufferSizeValue / ratio);
      writerOptions.stripeSize(baseStripeSizeValue / ratio);
      writerOptions.blockPadding(false);
    }
    writerOptions.fileSystem(fs).callback(indexBuilder);
    rowInspector = (StructObjectInspector) options.getInspector();
    writerOptions.inspector(createEventSchema(findRecId(options.getInspector(),
        options.getRecordIdColumn())));
    item = new OrcStruct(FIELDS);
    item.setFieldValue(OPERATION, operation);
    item.setFieldValue(CURRENT_TRANSACTION, currentTransaction);
    item.setFieldValue(ORIGINAL_TRANSACTION, originalTransaction);
    item.setFieldValue(BUCKET, bucket);
    item.setFieldValue(ROW_ID, rowId);
  }

  @Override
  public String toString() {
    return getClass().getName() + "[" + path + "]";
  }

  /**
   * To handle multiple INSERT... statements in a single transaction, we want
   * to make sure to generate a unique {@code rowId} for all inserted rows of
   * the transaction.
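   * For example (an illustrative walk-through, not from the source): if
   * statement 0 wrote 3 insert events and statement 1 wrote 2, then rows
   * written by statement 2 start at a rowId offset of 5.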
   * @return the largest rowId created by previous statements (may be 0)
   * @throws IOException
   */
  private long findRowIdOffsetForInsert() throws IOException {
    /*
     * 1. need to know the bucket we are writing to
     * 2. need to know which delta dir it's in
     * Then,
     * 1. find the same bucket file in the previous (insert) delta dir for
     *    this txn (Note: in case of split_update, we can ignore the
     *    delete_delta dirs)
     * 2. read the footer and get the AcidStats, which have the insert count
     * 2.1 if AcidStats.inserts > 0, add it to the running insert count;
     *     otherwise go to the previous delta file
     * For example, consider the insert/update/insert case...
     */
    if (options.getStatementId() <= 0) {
      return 0; // there is only 1 statement in this transaction (so far)
    }
    long totalInserts = 0;
    for (int pastStmt = options.getStatementId() - 1; pastStmt >= 0; pastStmt--) {
      Path matchingBucket = AcidUtils.createFilename(
          options.getFinalDestination(), options.clone().statementId(pastStmt));
      if (!fs.exists(matchingBucket)) {
        continue;
      }
      Reader reader = OrcFile.createReader(matchingBucket,
          OrcFile.readerOptions(options.getConfiguration()));
      // no close() on Reader?!
      AcidStats acidStats = OrcAcidUtils.parseAcidStats(reader);
      if (acidStats.inserts > 0) {
        totalInserts += acidStats.inserts;
      }
    }
    return totalInserts;
  }

  // Find the record identifier column (if there) and return a possibly new
  // ObjectInspector that will strain out the record id for the underlying
  // writer.
  private ObjectInspector findRecId(ObjectInspector inspector, int rowIdColNum) {
    if (!(inspector instanceof StructObjectInspector)) {
      throw new RuntimeException("Serious problem, expected a StructObjectInspector, " +
          "but got a " + inspector.getClass().getName());
    }
    if (rowIdColNum < 0) {
      return inspector;
    } else {
      RecIdStrippingObjectInspector newInspector =
          new RecIdStrippingObjectInspector(inspector, rowIdColNum);
      recIdField = newInspector.getRecId();
      List<? extends StructField> fields =
          ((StructObjectInspector) recIdField.getFieldObjectInspector()).getAllStructFieldRefs();
      // Go by position, not field name, as field names aren't guaranteed. The
      // order of fields in RecordIdentifier is transactionId, bucketId, rowId.
      originalTxnField = fields.get(0);
      origTxnInspector = (LongObjectInspector) originalTxnField.getFieldObjectInspector();
      bucketField = fields.get(1);
      rowIdField = fields.get(2);
      rowIdInspector = (LongObjectInspector) rowIdField.getFieldObjectInspector();
      recIdInspector = (StructObjectInspector) recIdField.getFieldObjectInspector();
      return newInspector;
    }
  }

  private void addSimpleEvent(int operation, long currentTransaction, long rowId, Object row)
      throws IOException {
    this.operation.set(operation);
    this.currentTransaction.set(currentTransaction);
    // If this is an insert, originalTransaction should be set to this
    // transaction.  If not, it will be reset by the following if anyway.
    long originalTransaction = currentTransaction;
    if (operation == DELETE_OPERATION || operation == UPDATE_OPERATION) {
      Object rowIdValue = rowInspector.getStructFieldData(row, recIdField);
      originalTransaction = origTxnInspector.get(
          recIdInspector.getStructFieldData(rowIdValue, originalTxnField));
      rowId = rowIdInspector.get(recIdInspector.getStructFieldData(rowIdValue, rowIdField));
    } else if (operation == INSERT_OPERATION) {
      rowId += rowIdOffset;
    }
    this.rowId.set(rowId);
    this.originalTransaction.set(originalTransaction);
    item.setFieldValue(OrcRecordUpdater.OPERATION, new IntWritable(operation));
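    // For delete events the ROW field is nulled out; the
    // (originalTransaction, bucket, rowId) key is what identifies the deleted
    // row to readers.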
    item.setFieldValue(OrcRecordUpdater.ROW,
        (operation == DELETE_OPERATION ? null : row));
    indexBuilder.addKey(operation, originalTransaction, bucket.get(), rowId);
    if (writer == null) {
      writer = OrcFile.createWriter(path, writerOptions);
    }
    writer.addRow(item);
  }

  private void addSplitUpdateEvent(int operation, long currentTransaction, long rowId, Object row)
      throws IOException {
    if (operation == INSERT_OPERATION) {
      // Just insert the record in the usual way, i.e., default to the simple behavior.
      addSimpleEvent(operation, currentTransaction, rowId, row);
      return;
    }
    this.operation.set(operation);
    this.currentTransaction.set(currentTransaction);
    Object rowValue = rowInspector.getStructFieldData(row, recIdField);
    long originalTransaction = origTxnInspector.get(
        recIdInspector.getStructFieldData(rowValue, originalTxnField));
    rowId = rowIdInspector.get(
        recIdInspector.getStructFieldData(rowValue, rowIdField));

    if (operation == DELETE_OPERATION || operation == UPDATE_OPERATION) {
      // Initialize the deleteEventWriter if not yet done (lazy initialization).
      if (deleteEventWriter == null) {
        // Initialize an indexBuilder for deleteEvents.
        deleteEventIndexBuilder = new KeyIndexBuilder();
        // Change the indexBuilder callback too for the deleteEvent file; the
        // remaining writer options stay the same.
        // TODO: When we change the callback, we are essentially mutating the
        // writerOptions. This works, but perhaps is not a good thing. The
        // proper way to do this would be to clone the writerOptions; however,
        // that requires that the parent OrcFile.writerOptions implement a
        // clone() method (which it does not for now). HIVE-14514 is currently
        // an open JIRA to fix this.
        this.deleteEventWriter = OrcFile.createWriter(deleteEventPath,
            writerOptions.callback(deleteEventIndexBuilder));
      }

      // A delete/update generates a delete event for the original row.
      this.rowId.set(rowId);
      this.originalTransaction.set(originalTransaction);
      item.setFieldValue(OrcRecordUpdater.OPERATION, new IntWritable(DELETE_OPERATION));
      item.setFieldValue(OrcRecordUpdater.ROW, null); // ROW is null for delete events.
      deleteEventIndexBuilder.addKey(DELETE_OPERATION, originalTransaction, bucket.get(), rowId);
      deleteEventWriter.addRow(item);
    }

    if (operation == UPDATE_OPERATION) {
      // A new row is also inserted into the usual delta file for an update event.
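      // Net effect: under split-update, an update is represented as a delete
      // event in the delete_delta file plus the fresh insert event below in
      // the regular delta file.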
      addSimpleEvent(INSERT_OPERATION, currentTransaction, insertedRows++, row);
    }
  }

  @Override
  public void insert(long currentTransaction, Object row) throws IOException {
    if (this.currentTransaction.get() != currentTransaction) {
      insertedRows = 0;
      // This is almost a no-op in the hcatalog.streaming case, since
      // statementId == 0 is always true there.
      rowIdOffset = findRowIdOffsetForInsert();
    }
    if (acidOperationalProperties.isSplitUpdate()) {
      addSplitUpdateEvent(INSERT_OPERATION, currentTransaction, insertedRows++, row);
    } else {
      addSimpleEvent(INSERT_OPERATION, currentTransaction, insertedRows++, row);
    }
    rowCountDelta++;
  }

  @Override
  public void update(long currentTransaction, Object row) throws IOException {
    if (this.currentTransaction.get() != currentTransaction) {
      insertedRows = 0;
      rowIdOffset = findRowIdOffsetForInsert();
    }
    if (acidOperationalProperties.isSplitUpdate()) {
      addSplitUpdateEvent(UPDATE_OPERATION, currentTransaction, -1L, row);
    } else {
      addSimpleEvent(UPDATE_OPERATION, currentTransaction, -1L, row);
    }
  }

  @Override
  public void delete(long currentTransaction, Object row) throws IOException {
    if (this.currentTransaction.get() != currentTransaction) {
      insertedRows = 0;
    }
    if (acidOperationalProperties.isSplitUpdate()) {
      addSplitUpdateEvent(DELETE_OPERATION, currentTransaction, -1L, row);
    } else {
      addSimpleEvent(DELETE_OPERATION, currentTransaction, -1L, row);
    }
    rowCountDelta--;
  }

  @Override
  public void flush() throws IOException {
    // We only support flushes on files with multiple transactions, because
    // flushes create significant overhead in HDFS. Record updaters with a
    // single transaction should be closed rather than flushed.
    if (flushLengths == null) {
      throw new IllegalStateException("Attempting to flush a RecordUpdater on "
          + path + " with a single transaction.");
    }
    if (writer == null) {
      writer = OrcFile.createWriter(path, writerOptions);
    }
    long len = writer.writeIntermediateFooter();
    flushLengths.writeLong(len);
    OrcInputFormat.SHIMS.hflush(flushLengths);
  }

  @Override
  public void close(boolean abort) throws IOException {
    if (abort) {
      if (flushLengths == null) {
        fs.delete(path, false);
      }
    } else if (!writerClosed) {
      if (acidOperationalProperties.isSplitUpdate()) {
        // When split-update is enabled, we can choose not to write any delta
        // files when there are no inserts. In such cases only the
        // delete_deltas would be written, and they are closed separately
        // below.
        if (writer != null && indexBuilder.acidStats.inserts > 0) {
          writer.close(); // normal close, when there are inserts.
        }
      } else {
        if (writer == null) {
          writer = OrcFile.createWriter(path, writerOptions);
        }
        writer.close(); // normal close.
      }
      if (deleteEventWriter != null) {
        if (deleteEventIndexBuilder.acidStats.deletes > 0) {
          // Only need to write out and close the delete_delta if there have
          // been any delete events.
          deleteEventWriter.close();
        } else {
          // Just remove the delete_delta if there have been no delete events.
          fs.delete(deleteEventPath, false);
        }
      }
    }
    if (flushLengths != null) {
      flushLengths.close();
      fs.delete(OrcAcidUtils.getSideFile(path), false);
    }
    writer = null;
    deleteEventWriter = null;
    writerClosed = true;
  }

  @Override
  public SerDeStats getStats() {
    SerDeStats stats = new SerDeStats();
    stats.setRowCount(rowCountDelta);
    // Don't worry about setting the raw data size diff.  I have no idea how to
    // calculate that without finding the row we are updating or deleting,
    // which would be a mess.
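    // Note that rowCountDelta can be negative: inserts increment it and
    // deletes decrement it, so a delete-only delta reports a negative row
    // count.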
    return stats;
  }

  private static final Charset utf8 = Charset.forName("UTF-8");
  private static final CharsetDecoder utf8Decoder = utf8.newDecoder();

  static RecordIdentifier[] parseKeyIndex(Reader reader) {
    String[] stripes;
    try {
      ByteBuffer val =
          reader.getMetadataValue(OrcRecordUpdater.ACID_KEY_INDEX_NAME)
              .duplicate();
      stripes = utf8Decoder.decode(val).toString().split(";");
    } catch (CharacterCodingException e) {
      throw new IllegalArgumentException("Bad string encoding for " +
          OrcRecordUpdater.ACID_KEY_INDEX_NAME, e);
    }
    RecordIdentifier[] result = new RecordIdentifier[stripes.length];
    for (int i = 0; i < stripes.length; ++i) {
      if (stripes[i].length() != 0) {
        String[] parts = stripes[i].split(",");
        result[i] = new RecordIdentifier();
        result[i].setValues(Long.parseLong(parts[0]),
            Integer.parseInt(parts[1]), Long.parseLong(parts[2]));
      }
    }
    return result;
  }

  static class KeyIndexBuilder implements OrcFile.WriterCallback {
    StringBuilder lastKey = new StringBuilder();
    long lastTransaction;
    int lastBucket;
    long lastRowId;
    AcidStats acidStats = new AcidStats();

    @Override
    public void preStripeWrite(OrcFile.WriterContext context) throws IOException {
      lastKey.append(lastTransaction);
      lastKey.append(',');
      lastKey.append(lastBucket);
      lastKey.append(',');
      lastKey.append(lastRowId);
      lastKey.append(';');
    }

    @Override
    public void preFooterWrite(OrcFile.WriterContext context) throws IOException {
      context.getWriter().addUserMetadata(ACID_KEY_INDEX_NAME,
          UTF8.encode(lastKey.toString()));
      context.getWriter().addUserMetadata(OrcAcidUtils.ACID_STATS,
          UTF8.encode(acidStats.serialize()));
    }

    void addKey(int op, long transaction, int bucket, long rowId) {
      switch (op) {
        case INSERT_OPERATION:
          acidStats.inserts += 1;
          break;
        case UPDATE_OPERATION:
          acidStats.updates += 1;
          break;
        case DELETE_OPERATION:
          acidStats.deletes += 1;
          break;
        default:
          throw new IllegalArgumentException("Unknown operation " + op);
      }
      lastTransaction = transaction;
      lastBucket = bucket;
      lastRowId = rowId;
    }
  }

  /**
   * An ObjectInspector that will strip out the record identifier so that the
   * underlying writer doesn't see it.
   */
  private static class RecIdStrippingObjectInspector extends StructObjectInspector {
    private StructObjectInspector wrapped;
    List<StructField> fields;
    StructField recId;

    RecIdStrippingObjectInspector(ObjectInspector oi, int rowIdColNum) {
      if (!(oi instanceof StructObjectInspector)) {
        throw new RuntimeException("Serious problem, expected a StructObjectInspector, " +
            "but got a " + oi.getClass().getName());
      }
      wrapped = (StructObjectInspector) oi;
      List<? extends StructField> wrappedFields = wrapped.getAllStructFieldRefs();
      fields = new ArrayList<StructField>(wrapped.getAllStructFieldRefs().size());
      for (int i = 0; i < wrappedFields.size(); i++) {
        if (i == rowIdColNum) {
          recId = wrappedFields.get(i);
        } else {
          fields.add(wrappedFields.get(i));
        }
      }
    }
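    // Note: the returned refs omit the recId field, so the positions of
    // fields after the stripped column are shifted down by one relative to
    // the wrapped inspector.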
    @Override
    public List<? extends StructField> getAllStructFieldRefs() {
      return fields;
    }

    @Override
    public StructField getStructFieldRef(String fieldName) {
      return wrapped.getStructFieldRef(fieldName);
    }

    @Override
    public Object getStructFieldData(Object data, StructField fieldRef) {
      // For performance, don't check that the fieldRef isn't recId every time;
      // just assume that the caller used getAllStructFieldRefs and thus
      // doesn't have that fieldRef.
      return wrapped.getStructFieldData(data, fieldRef);
    }

    @Override
    public List<Object> getStructFieldsDataAsList(Object data) {
      return wrapped.getStructFieldsDataAsList(data);
    }

    @Override
    public String getTypeName() {
      return wrapped.getTypeName();
    }

    @Override
    public Category getCategory() {
      return wrapped.getCategory();
    }

    StructField getRecId() {
      return recId;
    }
  }
}