/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.regionserver.wal;

import static org.apache.hadoop.hbase.io.asyncfs.FanOutOneBlockAsyncDFSOutputHelper.shouldRetryCreate;

import com.google.common.util.concurrent.ThreadFactoryBuilder;

import com.lmax.disruptor.RingBuffer;
import com.lmax.disruptor.Sequence;
import com.lmax.disruptor.Sequencer;

import io.netty.channel.EventLoop;
import io.netty.util.concurrent.SingleThreadEventExecutor;

import java.io.IOException;
import java.io.InterruptedIOException;
import java.lang.reflect.Field;
import java.util.ArrayDeque;
import java.util.Comparator;
import java.util.Deque;
import java.util.Iterator;
import java.util.List;
import java.util.Queue;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import java.util.function.Supplier;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseInterfaceAudience;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.hbase.client.ConnectionUtils;
import org.apache.hadoop.hbase.io.asyncfs.AsyncFSOutput;
import org.apache.hadoop.hbase.io.asyncfs.FanOutOneBlockAsyncDFSOutputHelper.NameNodeException;
import org.apache.hadoop.hbase.wal.AsyncFSWALProvider;
import org.apache.hadoop.hbase.wal.WALKey;
import org.apache.hadoop.hbase.wal.WALProvider.AsyncWriter;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.htrace.NullScope;
import org.apache.htrace.Span;
import org.apache.htrace.Trace;
import org.apache.htrace.TraceScope;

/**
 * An asynchronous implementation of FSWAL.
 * <p>
 * Here 'waitingConsumePayloads' acts as the RingBuffer in FSHLog.
 * <p>
 * For append, we process it as follows:
 * <ol>
 * <li>In the caller thread (typically, in the rpc handler thread):
 * <ol>
 * <li>Insert the entry into 'waitingConsumePayloads'. Use the ringbuffer sequence as txid.</li>
 * <li>Schedule the consumer task if needed. See {@link #shouldScheduleConsumer()} for more
 * details.
 * </li>
 * </ol>
 * </li>
 * <li>In the consumer task (in the EventLoop thread):
 * <ol>
 * <li>Poll the entry from {@link #waitingConsumePayloads} and insert it into
 * {@link #toWriteAppends}</li>
 * <li>Poll the entry from {@link #toWriteAppends}, append it to the AsyncWriter, and insert it
 * into {@link #unackedAppends}</li>
 * <li>If the buffered size reaches {@link #batchSize}, or there is a sync request, then we call
 * sync on the AsyncWriter.</li>
 * <li>In the callback methods:
 * <ul>
 * <li>If succeeded, poll the entry from {@link #unackedAppends} and drop it.</li>
 * <li>If failed, add all the entries in {@link #unackedAppends} back to {@link #toWriteAppends}
 * and wait for writing them again.</li>
 * </ul>
 * </li>
 * </ol>
 * </li>
 * </ol>
 * For sync, the processing stages are almost the same. Unlike FSHLog, if we hit a sync error we
 * will open a new writer, rewrite the unacked entries to the new writer, and sync again.
 * <p>
 * Here we only describe the logic of doReplaceWriter. The main logic of rollWriter is the same as
 * in FSHLog.<br>
 * For a normal roll request (for example, we have reached the log roll size):
 * <ol>
 * <li>In the log roller thread, we will set {@link #waitingRoll} to true and
 * {@link #readyForRolling} to false, and then wait on {@link #readyForRolling} (see
 * {@link #waitForSafePoint()}).</li>
 * <li>In the consumer thread, we will stop polling entries from {@link #waitingConsumePayloads}
 * if {@link #waitingRoll} is true, and also stop writing out the entries in
 * {@link #toWriteAppends}.</li>
 * <li>If there is unflushed data in the writer, sync it.</li>
 * <li>When all outgoing sync requests are finished, i.e., {@link #unackedAppends} is empty,
 * signal the {@link #readyForRollingCond}.</li>
 * <li>Back in the log roller thread, we can now confirm that there are no outgoing entries, i.e.,
 * we have reached a safe point. So it is safe to replace the old writer with the new writer now.
 * </li>
 * <li>Set {@link #writerBroken} and {@link #waitingRoll} to false, and cancel the log roller exit
 * checker if any (see the comments in the {@link #syncFailed(Throwable)} method to see why we
 * need a checker here).</li>
 * <li>Schedule the consumer task.</li>
 * <li>Schedule a background task to close the old writer.</li>
 * </ol>
 * For a broken writer roll request, the only difference is that we can bypass the wait for safe
 * point stage. See the comments in the {@link #syncFailed(Throwable)} method for more details.
 */
@InterfaceAudience.LimitedPrivate(HBaseInterfaceAudience.CONFIG)
public class AsyncFSWAL extends AbstractFSWAL<AsyncWriter> {

  private static final Log LOG = LogFactory.getLog(AsyncFSWAL.class);

  private static final Comparator<SyncFuture> SEQ_COMPARATOR = (o1, o2) -> {
    int c = Long.compare(o1.getTxid(), o2.getTxid());
    return c != 0 ? c : Integer.compare(System.identityHashCode(o1), System.identityHashCode(o2));
  };

  public static final String WAL_BATCH_SIZE = "hbase.wal.batch.size";
  public static final long DEFAULT_WAL_BATCH_SIZE = 64L * 1024;

  public static final String ASYNC_WAL_CREATE_MAX_RETRIES = "hbase.wal.async.create.retries";
  public static final int DEFAULT_ASYNC_WAL_CREATE_MAX_RETRIES = 10;

  private final EventLoop eventLoop;

  private final Lock consumeLock = new ReentrantLock();

  private final Runnable consumer = this::consume;

  // check if there is already a consumer task in the event loop's task queue
  private final Supplier<Boolean> hasConsumerTask;

  // a new writer has been created and we are waiting for the old writer to be closed.
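  // waitingRoll is set, under consumeLock, by waitForSafePoint in the log roller thread; while it
  // is true the consumer stops polling new entries and, once unackedAppends drains, sets
  // readyForRolling and signals readyForRollingCond so the roller can swap in the new writer.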
  private volatile boolean waitingRoll;

  private boolean readyForRolling;

  private final Condition readyForRollingCond = consumeLock.newCondition();

  private final RingBuffer<RingBufferTruck> waitingConsumePayloads;

  private final Sequence waitingConsumePayloadsGatingSequence;

  private final AtomicBoolean consumerScheduled = new AtomicBoolean(false);

  // writer is broken and rollWriter is needed.
  private volatile boolean writerBroken;

  private final long batchSize;

  private final int createMaxRetries;

  private final ExecutorService closeExecutor = Executors.newCachedThreadPool(
    new ThreadFactoryBuilder().setDaemon(true).setNameFormat("Close-WAL-Writer-%d").build());

  private volatile AsyncFSOutput fsOut;

  private final Deque<FSWALEntry> toWriteAppends = new ArrayDeque<>();

  private final Deque<FSWALEntry> unackedAppends = new ArrayDeque<>();

  private final SortedSet<SyncFuture> syncFutures = new TreeSet<>(SEQ_COMPARATOR);

  // the highest txid of WAL entries being processed
  private long highestProcessedAppendTxid;

  // file length when we issued the last sync request on the writer
  private long fileLengthAtLastSync;

  private long highestProcessedAppendTxidAtLastSync;

  public AsyncFSWAL(FileSystem fs, Path rootDir, String logDir, String archiveDir,
      Configuration conf, List<WALActionsListener> listeners, boolean failIfWALExists,
      String prefix, String suffix, EventLoop eventLoop)
      throws FailedLogCloseException, IOException {
    super(fs, rootDir, logDir, archiveDir, conf, listeners, failIfWALExists, prefix, suffix);
    this.eventLoop = eventLoop;
    Supplier<Boolean> hasConsumerTask;
    if (eventLoop instanceof SingleThreadEventExecutor) {
      try {
        Field field = SingleThreadEventExecutor.class.getDeclaredField("taskQueue");
        field.setAccessible(true);
        Queue<?> queue = (Queue<?>) field.get(eventLoop);
        hasConsumerTask = () -> queue.peek() == consumer;
      } catch (Exception e) {
        LOG.warn("Can not get task queue of " + eventLoop
            + ", this is not necessary, just give up", e);
        hasConsumerTask = () -> false;
      }
    } else {
      hasConsumerTask = () -> false;
    }
    this.hasConsumerTask = hasConsumerTask;
    int preallocatedEventCount =
        this.conf.getInt("hbase.regionserver.wal.disruptor.event.count", 1024 * 16);
    waitingConsumePayloads =
        RingBuffer.createMultiProducer(RingBufferTruck::new, preallocatedEventCount);
    waitingConsumePayloadsGatingSequence = new Sequence(Sequencer.INITIAL_CURSOR_VALUE);
    waitingConsumePayloads.addGatingSequences(waitingConsumePayloadsGatingSequence);

    // increase the ringbuffer sequence so that our txid starts from 1
    waitingConsumePayloads.publish(waitingConsumePayloads.next());
    waitingConsumePayloadsGatingSequence.set(waitingConsumePayloads.getCursor());

    batchSize = conf.getLong(WAL_BATCH_SIZE, DEFAULT_WAL_BATCH_SIZE);
    createMaxRetries =
        conf.getInt(ASYNC_WAL_CREATE_MAX_RETRIES, DEFAULT_ASYNC_WAL_CREATE_MAX_RETRIES);
    rollWriter();
  }

  // return whether we have successfully set readyForRolling to true.
  private boolean trySetReadyForRolling() {
    // Check without holding the lock first. Usually we will just return here.
    // waitingRoll is volatile and unackedAppends is only accessed inside the event loop, so it is
    // safe to check them outside the consumeLock.
    if (!waitingRoll || !unackedAppends.isEmpty()) {
      return false;
    }
    consumeLock.lock();
    try {
      // 1. a roll is requested
      // 2. all outgoing entries have been acked (we have confirmed this above).
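      // Re-check waitingRoll under the lock: doReplaceWriter may have cleared it between the
      // lock-free check above and acquiring consumeLock, in which case there is nothing to
      // signal.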
      if (waitingRoll) {
        readyForRolling = true;
        readyForRollingCond.signalAll();
        return true;
      } else {
        return false;
      }
    } finally {
      consumeLock.unlock();
    }
  }

  private void syncFailed(Throwable error) {
    LOG.warn("sync failed", error);
    // Here we depend on the implementation of FanOutOneBlockAsyncDFSOutput and netty.
    // When an error occurs, FanOutOneBlockAsyncDFSOutput will fail all pending flush requests.
    // This is executed inside the EventLoop, and DefaultPromise in netty will notify the
    // listeners directly if it is already in the EventLoop thread, and in the listener method it
    // will call us. So we know that all failed flush requests will call us back to back, and
    // before the last one finishes no other task can be executed in the EventLoop. So here it is
    // safe to use writerBroken as a guard.
    // Do not forget to revisit this if we change the implementation of
    // FanOutOneBlockAsyncDFSOutput!
    consumeLock.lock();
    try {
      if (writerBroken) {
        return;
      }
      writerBroken = true;
      if (waitingRoll) {
        readyForRolling = true;
        readyForRollingCond.signalAll();
      }
    } finally {
      consumeLock.unlock();
    }
    for (Iterator<FSWALEntry> iter = unackedAppends.descendingIterator(); iter.hasNext();) {
      toWriteAppends.addFirst(iter.next());
    }
    highestUnsyncedTxid = highestSyncedTxid.get();
    // request a roll.
    requestLogRoll();
  }

  private void syncCompleted(AsyncWriter writer, long processedTxid, long startTimeNs) {
    highestSyncedTxid.set(processedTxid);
    for (Iterator<FSWALEntry> iter = unackedAppends.iterator(); iter.hasNext();) {
      if (iter.next().getTxid() <= processedTxid) {
        iter.remove();
      } else {
        break;
      }
    }
    postSync(System.nanoTime() - startTimeNs, finishSync(true));
    // Ideally, we should set a flag to indicate that the log roll has already been requested for
    // the current writer and give up here, and reset the flag when the roll is finished. But we
    // finish the roll in the log roller thread, so the flag would need to be set by a different
    // thread, which typically means we would need a lock to protect it and do fencing. Since the
    // log roller aggregates the roll requests of the same WAL, it is safe to call requestLogRoll
    // multiple times before the roll actually happens. But we do need to stop if we have set
    // readyForRolling to true and woken up the log roller thread waiting in waitForSafePoint, as
    // the rollWriter call may return first and then we would run the code below and request a
    // roll on the new writer.
    if (trySetReadyForRolling()) {
      // we have just finished a roll, so we do not need to check for log rolling; the writer will
      // be closed soon.
      return;
    }
    if (writer.getLength() < logrollsize) {
      return;
    }
    if (!rollWriterLock.tryLock()) {
      return;
    }
    try {
      requestLogRoll();
    } finally {
      rollWriterLock.unlock();
    }
  }

  private void sync(AsyncWriter writer) {
    fileLengthAtLastSync = writer.getLength();
    long currentHighestProcessedAppendTxid = highestProcessedAppendTxid;
    highestProcessedAppendTxidAtLastSync = currentHighestProcessedAppendTxid;
    final long startTimeNs = System.nanoTime();
    writer.sync().whenComplete((result, error) -> {
      if (error != null) {
        syncFailed(error);
      } else {
        syncCompleted(writer, currentHighestProcessedAppendTxid, startTimeNs);
      }
    });
  }

  private void addTimeAnnotation(SyncFuture future, String annotation) {
    TraceScope scope = Trace.continueSpan(future.getSpan());
    Trace.addTimelineAnnotation(annotation);
    future.setSpan(scope.detach());
  }

  private int finishSyncLowerThanTxid(long txid, boolean addSyncTrace) {
    int finished = 0;
    for (Iterator<SyncFuture> iter = syncFutures.iterator(); iter.hasNext();) {
      SyncFuture sync = iter.next();
      if (sync.getTxid() <= txid) {
        sync.done(txid, null);
        iter.remove();
        finished++;
        if (addSyncTrace) {
          addTimeAnnotation(sync, "writer synced");
        }
      } else {
        break;
      }
    }
    return finished;
  }

  // try advancing the highestSyncedTxid as much as possible
  private int finishSync(boolean addSyncTrace) {
    if (unackedAppends.isEmpty()) {
      // All outstanding appends have been acked.
      if (toWriteAppends.isEmpty()) {
        // There are also no appends waiting to be written out, so just finish all pending syncs.
        long maxSyncTxid = highestSyncedTxid.get();
        for (SyncFuture sync : syncFutures) {
          maxSyncTxid = Math.max(maxSyncTxid, sync.getTxid());
          sync.done(maxSyncTxid, null);
          if (addSyncTrace) {
            addTimeAnnotation(sync, "writer synced");
          }
        }
        highestSyncedTxid.set(maxSyncTxid);
        int finished = syncFutures.size();
        syncFutures.clear();
        return finished;
      } else {
        // There is no append between highestProcessedAppendTxid and lowestUnprocessedAppendTxid,
        // so if highestSyncedTxid >= highestProcessedAppendTxid, then all syncs whose txids are
        // between highestProcessedAppendTxid and lowestUnprocessedAppendTxid can be finished.
        long lowestUnprocessedAppendTxid = toWriteAppends.peek().getTxid();
        assert lowestUnprocessedAppendTxid > highestProcessedAppendTxid;
        long doneTxid = lowestUnprocessedAppendTxid - 1;
        highestSyncedTxid.set(doneTxid);
        return finishSyncLowerThanTxid(doneTxid, addSyncTrace);
      }
    } else {
      // There are still unacked appends, so move the highestSyncedTxid up to the txid of the
      // first unacked append minus 1.
      long lowestUnackedAppendTxid = unackedAppends.peek().getTxid();
      long doneTxid = Math.max(lowestUnackedAppendTxid - 1, highestSyncedTxid.get());
      highestSyncedTxid.set(doneTxid);
      return finishSyncLowerThanTxid(doneTxid, addSyncTrace);
    }
  }

  private void appendAndSync() {
    final AsyncWriter writer = this.writer;
    // maybe a sync request is not queued when we issue a sync, so check here to see if we can
    // finish some.
    finishSync(false);
    long newHighestProcessedAppendTxid = -1L;
    for (Iterator<FSWALEntry> iter = toWriteAppends.iterator(); iter.hasNext();) {
      FSWALEntry entry = iter.next();
      boolean appended;
      Span span = entry.detachSpan();
      // the span may be null if this is a retry after rolling.
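      // (on a retry the entry's span was already detached and used by the first append attempt,
      // so there is nothing to trace here).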
      if (span != null) {
        TraceScope scope = Trace.continueSpan(span);
        try {
          appended = append(writer, entry);
        } catch (IOException e) {
          throw new AssertionError("should not happen", e);
        } finally {
          assert scope == NullScope.INSTANCE || !scope.isDetached();
          scope.close(); // append scope is complete
        }
      } else {
        try {
          appended = append(writer, entry);
        } catch (IOException e) {
          throw new AssertionError("should not happen", e);
        }
      }
      newHighestProcessedAppendTxid = entry.getTxid();
      iter.remove();
      if (appended) {
        unackedAppends.addLast(entry);
        if (writer.getLength() - fileLengthAtLastSync >= batchSize) {
          break;
        }
      }
    }
    // if we have a newer transaction id, update it; otherwise, keep the previous one.
    if (newHighestProcessedAppendTxid > 0) {
      highestProcessedAppendTxid = newHighestProcessedAppendTxid;
    } else {
      newHighestProcessedAppendTxid = highestProcessedAppendTxid;
    }
    if (writer.getLength() - fileLengthAtLastSync >= batchSize) {
      // sync because we have reached the buffer size limit.
      sync(writer);
      return;
    }
    if (writer.getLength() == fileLengthAtLastSync) {
      // we haven't written anything out, just advance the highestSyncedTxid since we may have
      // only stamped some region sequence ids.
      if (unackedAppends.isEmpty()) {
        highestSyncedTxid.set(highestProcessedAppendTxid);
        finishSync(false);
        trySetReadyForRolling();
      }
      return;
    }
    // we have some unsynced data but haven't reached the batch size yet
    if (!syncFutures.isEmpty()
        && syncFutures.last().getTxid() > highestProcessedAppendTxidAtLastSync) {
      // we have at least one sync request
      sync(writer);
    }
  }

  private void consume() {
    consumeLock.lock();
    try {
      if (writerBroken) {
        return;
      }
      if (waitingRoll) {
        if (writer.getLength() > fileLengthAtLastSync) {
          // issue a sync
          sync(writer);
        } else {
          if (unackedAppends.isEmpty()) {
            readyForRolling = true;
            readyForRollingCond.signalAll();
          }
        }
        return;
      }
    } finally {
      consumeLock.unlock();
    }
    long nextCursor = waitingConsumePayloadsGatingSequence.get() + 1;
    for (long cursorBound = waitingConsumePayloads.getCursor(); nextCursor <= cursorBound;
        nextCursor++) {
      if (!waitingConsumePayloads.isPublished(nextCursor)) {
        break;
      }
      RingBufferTruck truck = waitingConsumePayloads.get(nextCursor);
      switch (truck.type()) {
        case APPEND:
          toWriteAppends.addLast(truck.unloadAppend());
          break;
        case SYNC:
          syncFutures.add(truck.unloadSync());
          break;
        default:
          LOG.warn("RingBufferTruck with unexpected type: " + truck.type());
          break;
      }
      waitingConsumePayloadsGatingSequence.set(nextCursor);
    }
    appendAndSync();
    if (hasConsumerTask.get()) {
      return;
    }
    if (toWriteAppends.isEmpty()) {
      if (waitingConsumePayloadsGatingSequence.get() == waitingConsumePayloads.getCursor()) {
        consumerScheduled.set(false);
        // recheck here since in append and sync we do not hold the consumeLock. Things may happen
        // like:
        // 1. we check the cursor, no new entry
        // 2. someone publishes a new entry to the ringbuffer, sees that consumerScheduled is
        // true, and gives up scheduling the consumer task
        // 3. we set consumerScheduled to false and also give up scheduling the consumer task
        if (waitingConsumePayloadsGatingSequence.get() == waitingConsumePayloads.getCursor()) {
          return;
        } else {
          // maybe someone has grabbed this before us
          if (!consumerScheduled.compareAndSet(false, true)) {
            return;
          }
        }
      }
    }
    // reschedule if we still have something to write.
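    // Either there are still entries in toWriteAppends, or new payloads were published after the
    // check above; run another consume pass on the event loop.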
    eventLoop.execute(consumer);
  }

  private boolean shouldScheduleConsumer() {
    if (writerBroken || waitingRoll) {
      return false;
    }
    return consumerScheduled.compareAndSet(false, true);
  }

  @Override
  public long append(HRegionInfo hri, WALKey key, WALEdit edits, boolean inMemstore)
      throws IOException {
    long txid =
        stampSequenceIdAndPublishToRingBuffer(hri, key, edits, inMemstore, waitingConsumePayloads);
    if (shouldScheduleConsumer()) {
      eventLoop.execute(consumer);
    }
    return txid;
  }

  @Override
  public void sync() throws IOException {
    TraceScope scope = Trace.startSpan("AsyncFSWAL.sync");
    try {
      long txid = waitingConsumePayloads.next();
      SyncFuture future;
      try {
        future = getSyncFuture(txid, scope.detach());
        RingBufferTruck truck = waitingConsumePayloads.get(txid);
        truck.load(future);
      } finally {
        waitingConsumePayloads.publish(txid);
      }
      if (shouldScheduleConsumer()) {
        eventLoop.execute(consumer);
      }
      scope = Trace.continueSpan(blockOnSync(future));
    } finally {
      assert scope == NullScope.INSTANCE || !scope.isDetached();
      scope.close();
    }
  }

  @Override
  public void sync(long txid) throws IOException {
    if (highestSyncedTxid.get() >= txid) {
      return;
    }
    TraceScope scope = Trace.startSpan("AsyncFSWAL.sync");
    try {
      // here we do not use the ring buffer sequence as the txid
      long sequence = waitingConsumePayloads.next();
      SyncFuture future;
      try {
        future = getSyncFuture(txid, scope.detach());
        RingBufferTruck truck = waitingConsumePayloads.get(sequence);
        truck.load(future);
      } finally {
        waitingConsumePayloads.publish(sequence);
      }
      if (shouldScheduleConsumer()) {
        eventLoop.execute(consumer);
      }
      scope = Trace.continueSpan(blockOnSync(future));
    } finally {
      assert scope == NullScope.INSTANCE || !scope.isDetached();
      scope.close();
    }
  }

  @Override
  protected AsyncWriter createWriterInstance(Path path) throws IOException {
    boolean overwrite = false;
    for (int retry = 0;; retry++) {
      try {
        return AsyncFSWALProvider.createAsyncWriter(conf, fs, path, overwrite, eventLoop);
      } catch (RemoteException e) {
        LOG.warn("create wal log writer " + path + " failed, retry = " + retry, e);
        if (shouldRetryCreate(e)) {
          if (retry >= createMaxRetries) {
            break;
          }
        } else {
          IOException ioe = e.unwrapRemoteException();
          // this usually means the master already thinks we are dead, so let's fail all the
          // pending syncs. The shutdown process of the RS will wait for all regions to be closed
          // before calling WAL.close, so if we do not wake up the threads blocked on sync here it
          // will cause a deadlock.
          if (e.getMessage().contains("Parent directory doesn't exist:")) {
            syncFutures.forEach(f -> f.done(f.getTxid(), ioe));
          }
          throw ioe;
        }
      } catch (NameNodeException e) {
        throw e;
      } catch (IOException e) {
        LOG.warn("create wal log writer " + path + " failed, retry = " + retry, e);
        if (retry >= createMaxRetries) {
          break;
        }
        // overwrite the old broken file.
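        // A previous failed attempt may have left a partial file at this path, so ask
        // createAsyncWriter to overwrite it on the next try.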
        overwrite = true;
        try {
          Thread.sleep(ConnectionUtils.getPauseTime(100, retry));
        } catch (InterruptedException ie) {
          throw new InterruptedIOException();
        }
      }
    }
    throw new IOException("Failed to create wal log writer " + path + " after retrying "
        + createMaxRetries + " time(s)");
  }

  private void waitForSafePoint() {
    consumeLock.lock();
    try {
      if (writerBroken || this.writer == null) {
        return;
      }
      consumerScheduled.set(true);
      waitingRoll = true;
      readyForRolling = false;
      eventLoop.execute(consumer);
      while (!readyForRolling) {
        readyForRollingCond.awaitUninterruptibly();
      }
    } finally {
      consumeLock.unlock();
    }
  }

  @Override
  protected long doReplaceWriter(Path oldPath, Path newPath, AsyncWriter nextWriter)
      throws IOException {
    waitForSafePoint();
    final AsyncWriter oldWriter = this.writer;
    this.writer = nextWriter;
    if (nextWriter != null && nextWriter instanceof AsyncProtobufLogWriter) {
      this.fsOut = ((AsyncProtobufLogWriter) nextWriter).getOutput();
    }
    this.fileLengthAtLastSync = 0L;
    this.highestProcessedAppendTxidAtLastSync = 0L;
    consumeLock.lock();
    try {
      consumerScheduled.set(true);
      writerBroken = waitingRoll = false;
      eventLoop.execute(consumer);
    } finally {
      consumeLock.unlock();
    }
    long oldFileLen;
    if (oldWriter != null) {
      oldFileLen = oldWriter.getLength();
      closeExecutor.execute(() -> {
        try {
          oldWriter.close();
        } catch (IOException e) {
          LOG.warn("close old writer failed", e);
        }
      });
    } else {
      oldFileLen = 0L;
    }
    return oldFileLen;
  }

  @Override
  protected void doShutdown() throws IOException {
    waitForSafePoint();
    this.writer.close();
    this.writer = null;
    closeExecutor.shutdown();
    IOException error = new IOException("WAL has been closed");
    syncFutures.forEach(f -> f.done(f.getTxid(), error));
  }

  @Override
  protected void doAppend(AsyncWriter writer, FSWALEntry entry) {
    writer.append(entry);
  }

  @Override
  DatanodeInfo[] getPipeline() {
    AsyncFSOutput output = this.fsOut;
    return output != null ? output.getPipeline() : new DatanodeInfo[0];
  }

  @Override
  int getLogReplication() {
    return getPipeline().length;
  }
}