/** * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hbase.regionserver; import java.io.IOException; import java.util.LinkedList; import java.util.concurrent.atomic.AtomicLong; import org.apache.hadoop.hbase.classification.InterfaceAudience; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.ClassSize; /** * Manages the read/write consistency within memstore. This provides * an interface for readers to determine what entries to ignore, and * a mechanism for writers to obtain new write numbers, then "commit" * the new writes for readers to read (thus forming atomic transactions). */ @InterfaceAudience.Private public class MultiVersionConsistencyControl { private static final long NO_WRITE_NUMBER = 0; private volatile long memstoreRead = 0; private final Object readWaiters = new Object(); // This is the pending queue of writes. private final LinkedList<WriteEntry> writeQueue = new LinkedList<WriteEntry>(); /** * Default constructor. Initializes the memstoreRead/Write points to 0. */ public MultiVersionConsistencyControl() { } /** * Initializes the memstoreRead/Write points appropriately. * @param startPoint */ public void initialize(long startPoint) { synchronized (writeQueue) { writeQueue.clear(); memstoreRead = startPoint; } } /** * * @param initV The value we used initially and expected it'll be reset later * @return WriteEntry instance. */ WriteEntry beginMemstoreInsert() { return beginMemstoreInsertWithSeqNum(NO_WRITE_NUMBER); } /** * Get a mvcc write number before an actual one(its log sequence Id) being assigned * @param sequenceId * @return long a faked write number which is bigger enough not to be seen by others before a real * one is assigned */ public static long getPreAssignedWriteNumber(AtomicLong sequenceId) { // the 1 billion is just an arbitrary big number to guard no scanner will reach it before // current MVCC completes. Theoretically the bump only needs to be 2 * the number of handlers // because each handler could increment sequence num twice and max concurrent in-flight // transactions is the number of RPC handlers. // we can't use Long.MAX_VALUE because we still want to maintain the ordering when multiple // changes touch same row key // If for any reason, the bumped value isn't reset due to failure situations, we'll reset // curSeqNum to NO_WRITE_NUMBER in order NOT to advance memstore read point at all return sequenceId.incrementAndGet() + 1000000000; } /** * This function starts a MVCC transaction with current region's log change sequence number. Since * we set change sequence number when flushing current change to WAL(late binding), the flush * order may differ from the order to start a MVCC transaction. For example, a change begins a * MVCC firstly may complete later than a change which starts MVCC at a later time. Therefore, we * add a safe bumper to the passed in sequence number to start a MVCC so that no other concurrent * transactions will reuse the number till current MVCC completes(success or fail). The "faked" * big number is safe because we only need it to prevent current change being seen and the number * will be reset to real sequence number(set in log sync) right before we complete a MVCC in order * for MVCC to align with flush sequence. * @param curSeqNum * @return WriteEntry a WriteEntry instance with the passed in curSeqNum */ public WriteEntry beginMemstoreInsertWithSeqNum(long curSeqNum) { WriteEntry e = new WriteEntry(curSeqNum); synchronized (writeQueue) { writeQueue.add(e); return e; } } /** * Complete a {@link WriteEntry} that was created by * {@link #beginMemstoreInsertWithSeqNum(long)}. At the end of this call, the global read * point is at least as large as the write point of the passed in WriteEntry. Thus, the write is * visible to MVCC readers. * @throws IOException */ public void completeMemstoreInsertWithSeqNum(WriteEntry e, SequenceId seqId) throws IOException { if(e == null) return; if (seqId != null) { e.setWriteNumber(seqId.getSequenceId()); } else { // set the value to NO_WRITE_NUMBER in order NOT to advance memstore readpoint inside // function beginMemstoreInsertWithSeqNum in case of failures e.setWriteNumber(NO_WRITE_NUMBER); } waitForPreviousTransactionsComplete(e); } /** * Complete a {@link WriteEntry} that was created by {@link #beginMemstoreInsert()}. At the * end of this call, the global read point is at least as large as the write point of the passed * in WriteEntry. Thus, the write is visible to MVCC readers. */ public void completeMemstoreInsert(WriteEntry e) { waitForPreviousTransactionsComplete(e); } /** * Mark the {@link WriteEntry} as complete and advance the read point as * much as possible. * * How much is the read point advanced? * Let S be the set of all write numbers that are completed and where all previous write numbers * are also completed. Then, the read point is advanced to the supremum of S. * * @param e * @return true if e is visible to MVCC readers (that is, readpoint >= e.writeNumber) */ boolean advanceMemstore(WriteEntry e) { long nextReadValue = -1; synchronized (writeQueue) { e.markCompleted(); while (!writeQueue.isEmpty()) { WriteEntry queueFirst = writeQueue.getFirst(); if (queueFirst.isCompleted()) { // Using Max because Edit complete in WAL sync order not arriving order nextReadValue = Math.max(nextReadValue, queueFirst.getWriteNumber()); writeQueue.removeFirst(); } else { break; } } if (nextReadValue > memstoreRead) { memstoreRead = nextReadValue; } // notify waiters on writeQueue before return writeQueue.notifyAll(); } if (nextReadValue > 0) { synchronized (readWaiters) { readWaiters.notifyAll(); } } if (memstoreRead >= e.getWriteNumber()) { return true; } return false; } /** * Advances the current read point to be given seqNum if it is smaller than * that. */ void advanceMemstoreReadPointIfNeeded(long seqNum) { synchronized (writeQueue) { if (this.memstoreRead < seqNum) { memstoreRead = seqNum; } } } /** * Wait for all previous MVCC transactions complete */ public void waitForPreviousTransactionsComplete() { WriteEntry w = beginMemstoreInsert(); waitForPreviousTransactionsComplete(w); } public void waitForPreviousTransactionsComplete(WriteEntry waitedEntry) { boolean interrupted = false; WriteEntry w = waitedEntry; try { WriteEntry firstEntry = null; do { synchronized (writeQueue) { // writeQueue won't be empty at this point, the following is just a safety check if (writeQueue.isEmpty()) { break; } firstEntry = writeQueue.getFirst(); if (firstEntry == w) { // all previous in-flight transactions are done break; } try { writeQueue.wait(0); } catch (InterruptedException ie) { // We were interrupted... finish the loop -- i.e. cleanup --and then // on our way out, reset the interrupt flag. interrupted = true; break; } } } while (firstEntry != null); } finally { if (w != null) { advanceMemstore(w); } } if (interrupted) { Thread.currentThread().interrupt(); } } public long memstoreReadPoint() { return memstoreRead; } public static class WriteEntry { private long writeNumber; private volatile boolean completed = false; WriteEntry(long writeNumber) { this.writeNumber = writeNumber; } void markCompleted() { this.completed = true; } boolean isCompleted() { return this.completed; } long getWriteNumber() { return this.writeNumber; } void setWriteNumber(long val){ this.writeNumber = val; } } public static final long FIXED_SIZE = ClassSize.align( ClassSize.OBJECT + 2 * Bytes.SIZEOF_LONG + 2 * ClassSize.REFERENCE); }