/** * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hbase.regionserver; import java.util.LinkedList; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.ClassSize; import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.Log; /** * Manages the read/write consistency within memstore. This provides * an interface for readers to determine what entries to ignore, and * a mechanism for writers to obtain new write numbers, then "commit" * the new writes for readers to read (thus forming atomic transactions). */ @InterfaceAudience.Private public class MultiVersionConsistencyControl { private volatile long memstoreRead = 0; private volatile long memstoreWrite = 0; private final Object readWaiters = new Object(); // This is the pending queue of writes. private final LinkedList<WriteEntry> writeQueue = new LinkedList<WriteEntry>(); private static final ThreadLocal<Long> perThreadReadPoint = new ThreadLocal<Long>() { @Override protected Long initialValue() { return Long.MAX_VALUE; } }; /** * Default constructor. Initializes the memstoreRead/Write points to 0. */ public MultiVersionConsistencyControl() { this.memstoreRead = this.memstoreWrite = 0; } /** * Initializes the memstoreRead/Write points appropriately. * @param startPoint */ public void initialize(long startPoint) { synchronized (writeQueue) { if (this.memstoreWrite != this.memstoreRead) { throw new RuntimeException("Already used this mvcc. Too late to initialize"); } this.memstoreRead = this.memstoreWrite = startPoint; } } /** * Get this thread's read point. Used primarily by the memstore scanner to * know which values to skip (ie: have not been completed/committed to * memstore). */ public static long getThreadReadPoint() { return perThreadReadPoint.get(); } /** * Set the thread read point to the given value. The thread MVCC * is used by the Memstore scanner so it knows which values to skip. * Give it a value of 0 if you want everything. */ public static void setThreadReadPoint(long readPoint) { perThreadReadPoint.set(readPoint); } /** * Set the thread MVCC read point to whatever the current read point is in * this particular instance of MVCC. Returns the new thread read point value. */ public static long resetThreadReadPoint(MultiVersionConsistencyControl mvcc) { perThreadReadPoint.set(mvcc.memstoreReadPoint()); return getThreadReadPoint(); } /** * Set the thread MVCC read point to 0 (include everything). */ public static void resetThreadReadPoint() { perThreadReadPoint.set(0L); } /** * Generate and return a {@link WriteEntry} with a new write number. * To complete the WriteEntry and wait for it to be visible, * call {@link #completeMemstoreInsert(WriteEntry)}. */ public WriteEntry beginMemstoreInsert() { synchronized (writeQueue) { long nextWriteNumber = ++memstoreWrite; WriteEntry e = new WriteEntry(nextWriteNumber); writeQueue.add(e); return e; } } /** * Complete a {@link WriteEntry} that was created by {@link #beginMemstoreInsert()}. * * At the end of this call, the global read point is at least as large as the write point * of the passed in WriteEntry. Thus, the write is visible to MVCC readers. */ public void completeMemstoreInsert(WriteEntry e) { advanceMemstore(e); waitForRead(e); } /** * Mark the {@link WriteEntry} as complete and advance the read point as * much as possible. * * How much is the read point advanced? * Let S be the set of all write numbers that are completed and where all previous write numbers * are also completed. Then, the read point is advanced to the supremum of S. * * @param e * @return true if e is visible to MVCC readers (that is, readpoint >= e.writeNumber) */ boolean advanceMemstore(WriteEntry e) { synchronized (writeQueue) { e.markCompleted(); long nextReadValue = -1; boolean ranOnce=false; while (!writeQueue.isEmpty()) { ranOnce=true; WriteEntry queueFirst = writeQueue.getFirst(); if (nextReadValue > 0) { if (nextReadValue+1 != queueFirst.getWriteNumber()) { throw new RuntimeException("invariant in completeMemstoreInsert violated, prev: " + nextReadValue + " next: " + queueFirst.getWriteNumber()); } } if (queueFirst.isCompleted()) { nextReadValue = queueFirst.getWriteNumber(); writeQueue.removeFirst(); } else { break; } } if (!ranOnce) { throw new RuntimeException("never was a first"); } if (nextReadValue > 0) { synchronized (readWaiters) { memstoreRead = nextReadValue; readWaiters.notifyAll(); } } if (memstoreRead >= e.getWriteNumber()) { return true; } return false; } } /** * Wait for the global readPoint to advance upto * the specified transaction number. */ public void waitForRead(WriteEntry e) { boolean interrupted = false; synchronized (readWaiters) { while (memstoreRead < e.getWriteNumber()) { try { readWaiters.wait(0); } catch (InterruptedException ie) { // We were interrupted... finish the loop -- i.e. cleanup --and then // on our way out, reset the interrupt flag. interrupted = true; } } } if (interrupted) Thread.currentThread().interrupt(); } public long memstoreReadPoint() { return memstoreRead; } public static class WriteEntry { private long writeNumber; private boolean completed = false; WriteEntry(long writeNumber) { this.writeNumber = writeNumber; } void markCompleted() { this.completed = true; } boolean isCompleted() { return this.completed; } long getWriteNumber() { return this.writeNumber; } } public static final long FIXED_SIZE = ClassSize.align( ClassSize.OBJECT + 2 * Bytes.SIZEOF_LONG + 2 * ClassSize.REFERENCE); }