/**
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.bookkeeper.bookie;

import org.apache.bookkeeper.bookie.Bookie.NoLedgerException;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.TimeUnit;

import org.apache.bookkeeper.stats.Counter;
import org.apache.bookkeeper.stats.OpStatsLogger;
import org.apache.bookkeeper.stats.StatsLogger;
import org.apache.bookkeeper.util.MathUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.bookkeeper.bookie.CheckpointSource.Checkpoint;
import org.apache.bookkeeper.conf.ServerConfiguration;

import static org.apache.bookkeeper.bookie.BookKeeperServerStats.SKIP_LIST_FLUSH_BYTES;
import static org.apache.bookkeeper.bookie.BookKeeperServerStats.SKIP_LIST_GET_ENTRY;
import static org.apache.bookkeeper.bookie.BookKeeperServerStats.SKIP_LIST_PUT_ENTRY;
import static org.apache.bookkeeper.bookie.BookKeeperServerStats.SKIP_LIST_SNAPSHOT;
import static org.apache.bookkeeper.bookie.BookKeeperServerStats.SKIP_LIST_THROTTLING;

/**
 * The EntryMemTable holds an in-memory representation of the entries that have
 * not yet been flushed.
 *
 * <p>When asked to flush, the current {@link EntrySkipList} is moved to the
 * snapshot slot and replaced by a fresh, empty one. Reads are served from both
 * the current skip list and the snapshot until the flusher has processed the
 * snapshot; at that point the snapshot is let go.
 */
public class EntryMemTable {
    // Category is this class so that log output is attributed to the mem-table.
    private static final Logger LOG = LoggerFactory.getLogger(EntryMemTable.class);

    /**
     * Skip list of entries, tagged with the checkpoint that was current when
     * the list was created.
     */
    static class EntrySkipList extends ConcurrentSkipListMap<EntryKey, EntryKeyValue> {
        /** Checkpoint handed to the constructor of this skip list. */
        final Checkpoint cp;

        /**
         * Placeholder used when there is no snapshot. It is tagged with
         * {@link Checkpoint#MAX} and always reports itself as empty.
         */
        static final EntrySkipList EMPTY_VALUE = new EntrySkipList(Checkpoint.MAX) {
            @Override
            public boolean isEmpty() {
                return true;
            }
        };

        EntrySkipList(final Checkpoint cp) {
            super(EntryKey.COMPARATOR);
            this.cp = cp;
        }

        /**
         * Compare the checkpoint of this skip list against the given one.
         *
         * @param cp checkpoint to compare against
         * @return the result of {@code this.cp.compareTo(cp)}
         */
        int compareTo(final Checkpoint cp) {
            return this.cp.compareTo(cp);
        }

        /**
         * Delegates to {@link #putIfAbsent(EntryKey, EntryKeyValue)}, so an
         * existing mapping is never overwritten.
         */
        @Override
        public EntryKeyValue put(EntryKey k, EntryKeyValue v) {
            return putIfAbsent(k, v);
        }

        /**
         * Inserts the value using the value itself as the map key; the passed
         * key is only checked (by assertion) to be equal to the value.
         */
        @Override
        public EntryKeyValue putIfAbsent(EntryKey k, EntryKeyValue v) {
            assert k.equals(v);
            return super.putIfAbsent(v, v);
        }

        /** Skip lists are compared by identity only. */
        @Override
        public boolean equals(Object o) {
            return this == o;
        }

        /** Identity hash code, consistent with the identity-based {@link #equals(Object)}. */
        @Override
        public int hashCode() {
            return System.identityHashCode(this);
        }
    }

    // Skip list currently receiving writes.
    volatile EntrySkipList kvmap;

    // Snapshot of EntryMemTable. Made for flusher.
    volatile EntrySkipList snapshot;

    final ServerConfiguration conf;
    final CheckpointSource checkpointSource;

    // Writers and readers take the read lock; swapping or clearing the
    // skip lists takes the write lock.
    final ReentrantReadWriteLock lock = new ReentrantReadWriteLock();

    // Used to track own data size
    final AtomicLong size;

    final long skipListSizeLimit;

    SkipListArena allocator;

    // flag indicating the status of the previous flush call
    private final AtomicBoolean previousFlushSucceeded;

    /** Create an empty skip list tagged with a new checkpoint from the checkpoint source. */
    private EntrySkipList newSkipList() {
        return new EntrySkipList(checkpointSource.newCheckpoint());
    }

    // Stats
    private final OpStatsLogger snapshotStats;
    private final OpStatsLogger putEntryStats;
    private final OpStatsLogger getEntryStats;
    private final Counter flushBytesCounter;
    private final Counter throttlingCounter;

    /**
     * Constructor.
     *
     * @param conf        server configuration; supplies the skip list size limit and
     *                    is passed to the {@link SkipListArena}
     * @param source      checkpoint source used to tag every new skip list
     * @param statsLogger stats logger from which the op stats and counters are obtained
     */
    public EntryMemTable(final ServerConfiguration conf, final CheckpointSource source,
                         final StatsLogger statsLogger) {
        // checkpointSource must be set before newSkipList() is called.
        this.checkpointSource = source;
        this.kvmap = newSkipList();
        this.snapshot = EntrySkipList.EMPTY_VALUE;
        this.conf = conf;
        this.size = new AtomicLong(0);
        this.allocator = new SkipListArena(conf);
        this.previousFlushSucceeded = new AtomicBoolean(true);
        // skip list size limit
        this.skipListSizeLimit = conf.getSkipListSizeLimit();

        // Stats
        this.snapshotStats = statsLogger.getOpStatsLogger(SKIP_LIST_SNAPSHOT);
        this.putEntryStats = statsLogger.getOpStatsLogger(SKIP_LIST_PUT_ENTRY);
        this.getEntryStats = statsLogger.getOpStatsLogger(SKIP_LIST_GET_ENTRY);
        this.flushBytesCounter = statsLogger.getCounter(SKIP_LIST_FLUSH_BYTES);
        this.throttlingCounter = statsLogger.getCounter(SKIP_LIST_THROTTLING);
    }

    /** Log, at info level, every key of the current skip list followed by every key of the snapshot. */
    void dump() {
        for (EntryKey key : this.kvmap.keySet()) {
            LOG.info(key.toString());
        }
        for (EntryKey key : this.snapshot.keySet()) {
            LOG.info(key.toString());
        }
    }

    /**
     * Snapshot the current EntryMemTable against {@link Checkpoint#MAX}.
     *
     * @return checkpoint of the snapshot, null means no snapshot
     * @throws IOException per the signature of {@link #snapshot(Checkpoint)}
     */
    Checkpoint snapshot() throws IOException {
        return snapshot(Checkpoint.MAX);
    }

    /**
     * Snapshot current EntryMemTable. If given <i>oldCp</i> is older than current checkpoint,
     * we don't do any snapshot. If snapshot happened, we return the checkpoint of the snapshot.
     *
     * @param oldCp
     *          checkpoint
     * @return checkpoint of the snapshot, null means no snapshot
     * @throws IOException per the declared signature
     */
    Checkpoint snapshot(Checkpoint oldCp) throws IOException {
        Checkpoint cp = null;
        // No-op if snapshot currently has entries
        if (this.snapshot.isEmpty() && this.kvmap.compareTo(oldCp) < 0) {
            final long startTimeNanos = MathUtils.nowInNano();
            this.lock.writeLock().lock();
            try {
                // Re-check under the write lock; the state may have changed
                // since the unlocked check above.
                if (this.snapshot.isEmpty() && !this.kvmap.isEmpty()
                        && this.kvmap.compareTo(oldCp) < 0) {
                    this.snapshot = this.kvmap;
                    this.kvmap = newSkipList();
                    // get the checkpoint of the memtable.
                    // (read from the freshly created skip list, i.e. the checkpoint
                    // obtained at the moment of the swap)
                    cp = this.kvmap.cp;
                    // Reset heap to not include any keys
                    this.size.set(0);
                    // Reset allocator so we get a fresh buffer for the new EntryMemTable
                    this.allocator = new SkipListArena(conf);
                }
            } finally {
                this.lock.writeLock().unlock();
            }

            // An attempt that took the lock but swapped nothing is recorded as a failed event.
            if (null != cp) {
                snapshotStats.registerSuccessfulEvent(MathUtils.elapsedNanos(startTimeNanos),
                        TimeUnit.NANOSECONDS);
            } else {
                snapshotStats.registerFailedEvent(MathUtils.elapsedNanos(startTimeNanos),
                        TimeUnit.NANOSECONDS);
            }
        }
        return cp;
    }

    /**
     * Flush snapshot and clear it.
     *
     * @param flusher receives every entry of the snapshot
     * @return number of bytes accounted for by the flushed snapshot
     * @throws IOException if the flush fails; the failure is remembered so that
     *         the next {@link #addEntry} notifies its callback
     */
    long flush(final SkipListFlusher flusher) throws IOException {
        try {
            long flushSize = flushSnapshot(flusher, Checkpoint.MAX);
            previousFlushSucceeded.set(true);
            return flushSize;
        } catch (IOException ioe) {
            previousFlushSucceeded.set(false);
            throw ioe;
        }
    }

    /**
     * Flush memtable until checkpoint.
     *
     * <p>First flushes an existing snapshot, then tries to take a new snapshot
     * bounded by the checkpoint and flushes that one as well.
     *
     * @param flusher receives every flushed entry
     * @param checkpoint
     *          all data before this checkpoint need to be flushed.
     * @return number of bytes accounted for by the flushed snapshots
     * @throws IOException if the flush fails; the failure is remembered so that
     *         the next {@link #addEntry} notifies its callback
     */
    public long flush(SkipListFlusher flusher, Checkpoint checkpoint) throws IOException {
        try {
            long flushedBytes = flushSnapshot(flusher, checkpoint);
            if (null != snapshot(checkpoint)) {
                flushedBytes += flushSnapshot(flusher, checkpoint);
            }
            previousFlushSucceeded.set(true);
            return flushedBytes;
        } catch (IOException ioe) {
            previousFlushSucceeded.set(false);
            throw ioe;
        }
    }

    /**
     * Flush snapshot and clear it iff its data is before checkpoint.
     * Only this function change non-empty this.snapshot.
     *
     * @param flusher receives every entry of the snapshot
     * @param checkpoint upper bound; the snapshot is flushed only when its checkpoint compares lower
     * @return sum of the lengths of all entries in the flushed snapshot, 0 if nothing was flushed
     * @throws IOException if propagated from the flusher
     */
    private long flushSnapshot(final SkipListFlusher flusher, Checkpoint checkpoint)
            throws IOException {
        long flushedBytes = 0;
        if (this.snapshot.compareTo(checkpoint) < 0) {
            long ledger;
            // Id of the most recent ledger the flusher reported as missing;
            // its following entries are skipped.
            long ledgerGC = -1;
            synchronized (this) {
                EntrySkipList keyValues = this.snapshot;
                // Re-check on the reference captured inside the monitor.
                if (keyValues.compareTo(checkpoint) < 0) {
                    for (EntryKey key : keyValues.keySet()) {
                        // EntrySkipList.putIfAbsent stores each value as its own key.
                        EntryKeyValue kv = (EntryKeyValue) key;
                        // Skipped entries are still counted in the returned size.
                        flushedBytes += kv.getLength();
                        ledger = kv.getLedgerId();
                        if (ledgerGC != ledger) {
                            try {
                                flusher.process(ledger, kv.getEntryId(), kv.getValueAsByteBuffer());
                            } catch (NoLedgerException exception) {
                                // Deliberately not propagated: remember the ledger and
                                // skip its remaining entries.
                                ledgerGC = ledger;
                            }
                        }
                    }
                    flushBytesCounter.add(flushedBytes);
                    clearSnapshot(keyValues);
                }
            }
        }

        return flushedBytes;
    }

    /**
     * The passed snapshot was successfully persisted; it can be let go.
     *
     * @param keyValues The snapshot to clean out.
     * @see #snapshot()
     */
    private void clearSnapshot(final EntrySkipList keyValues) {
        // Caller makes sure that keyValues not empty
        assert !keyValues.isEmpty();
        this.lock.writeLock().lock();
        try {
            // create a new snapshot and let the old one go.
            assert this.snapshot == keyValues;
            this.snapshot = EntrySkipList.EMPTY_VALUE;
        } finally {
            this.lock.writeLock().unlock();
        }
    }

    /**
     * Throttling writer w/ 1 ms delay. The throttling counter is incremented
     * even when the sleep is interrupted; the interrupt flag is restored.
     */
    private void throttleWriters() {
        try {
            Thread.sleep(1);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
        throttlingCounter.inc();
    }

    /**
     * Write an update.
     *
     * <p>If the size limit is reached or the previous flush failed, a snapshot is
     * attempted first. The callback is notified when a snapshot was taken or the
     * previous flush failed; otherwise the writer is throttled briefly.
     *
     * @param ledgerId ledger the entry belongs to
     * @param entryId id of the entry within the ledger
     * @param entry entry payload; its remaining bytes are stored
     * @param cb callback notified via {@code onSizeLimitReached()}
     * @return approximate size of the passed key and value, 0 if the entry was already present.
     * @throws IOException per the declared signatures of the snapshot, callback and insert calls
     */
    public long addEntry(long ledgerId, long entryId, final ByteBuffer entry, final CacheCallback cb)
            throws IOException {
        long addedBytes = 0;
        long startTimeNanos = MathUtils.nowInNano();
        boolean success = false;
        try {
            if (isSizeLimitReached() || (!previousFlushSucceeded.get())) {
                Checkpoint cp = snapshot();
                if ((null != cp) || (!previousFlushSucceeded.get())) {
                    cb.onSizeLimitReached();
                } else {
                    throttleWriters();
                }
            }

            this.lock.readLock().lock();
            try {
                EntryKeyValue toAdd = cloneWithAllocator(ledgerId, entryId, entry);
                addedBytes = internalAdd(toAdd);
            } finally {
                this.lock.readLock().unlock();
            }
            success = true;
            return addedBytes;
        } finally {
            if (success) {
                putEntryStats.registerSuccessfulEvent(MathUtils.elapsedNanos(startTimeNanos),
                        TimeUnit.NANOSECONDS);
            } else {
                putEntryStats.registerFailedEvent(MathUtils.elapsedNanos(startTimeNanos),
                        TimeUnit.NANOSECONDS);
            }
        }
    }

    /**
     * Internal version of add() that doesn't clone KVs with the
     * allocator, and doesn't take the lock.
     *
     * Callers should ensure they already have the read lock taken
     *
     * @param toAdd entry to insert
     * @return length of the entry if it was inserted, 0 if an equal key was already present
     */
    private long internalAdd(final EntryKeyValue toAdd) throws IOException {
        long sizeChange = 0;
        if (kvmap.putIfAbsent(toAdd, toAdd) == null) {
            sizeChange = toAdd.getLength();
            size.addAndGet(sizeChange);
        }
        return sizeChange;
    }

    /**
     * Build an entry without going through the allocator.
     *
     * <p>For an array-backed buffer the backing array is referenced directly
     * (no copy) and the buffer position is left untouched; otherwise the
     * remaining bytes are copied into a new array, which advances the buffer.
     * NOTE(review): the array-backed path aliases the caller's array - confirm
     * callers do not reuse the buffer afterwards.
     */
    private EntryKeyValue newEntry(long ledgerId, long entryId, final ByteBuffer entry) {
        byte[] buf;
        int offset = 0;
        int length = entry.remaining();

        if (entry.hasArray()) {
            buf = entry.array();
            // The remaining bytes start at arrayOffset() + position() within the
            // backing array; arrayOffset() alone is only the index of buffer position 0.
            offset = entry.arrayOffset() + entry.position();
        } else {
            buf = new byte[length];
            entry.get(buf);
        }
        return new EntryKeyValue(ledgerId, entryId, buf, offset, length);
    }

    /**
     * Copy the remaining bytes of the entry into a slice obtained from the
     * allocator and wrap that slice in an {@link EntryKeyValue}. Falls back to
     * {@link #newEntry} when the allocator returns no slice.
     */
    private EntryKeyValue cloneWithAllocator(long ledgerId, long entryId, final ByteBuffer entry) {
        int len = entry.remaining();
        SkipListArena.MemorySlice alloc = allocator.allocateBytes(len);
        if (alloc == null) {
            // The allocation was too large, allocator decided
            // not to do anything with it.
            return newEntry(ledgerId, entryId, entry);
        }

        assert alloc.getData() != null;
        entry.get(alloc.getData(), alloc.getOffset(), len);
        return new EntryKeyValue(ledgerId, entryId, alloc.getData(), alloc.getOffset(), len);
    }

    /**
     * Find the entry with given key. The current skip list is consulted first,
     * then the snapshot.
     *
     * @param ledgerId ledger id of the entry
     * @param entryId entry id within the ledger
     * @return the entry kv or null if none found.
     */
    public EntryKeyValue getEntry(long ledgerId, long entryId) throws IOException {
        EntryKey key = new EntryKey(ledgerId, entryId);
        EntryKeyValue value = null;
        long startTimeNanos = MathUtils.nowInNano();
        boolean success = false;
        this.lock.readLock().lock();
        try {
            value = this.kvmap.get(key);
            if (value == null) {
                value = this.snapshot.get(key);
            }
            success = true;
        } finally {
            this.lock.readLock().unlock();
            if (success) {
                getEntryStats.registerSuccessfulEvent(MathUtils.elapsedNanos(startTimeNanos),
                        TimeUnit.NANOSECONDS);
            } else {
                getEntryStats.registerFailedEvent(MathUtils.elapsedNanos(startTimeNanos),
                        TimeUnit.NANOSECONDS);
            }
        }

        return value;
    }

    /**
     * Find the last entry with the given ledger key. The lookup takes the
     * greatest key not above {@code (ledgerId, Long.MAX_VALUE)} in the current
     * skip list and falls back to the snapshot when that key is missing or
     * belongs to another ledger.
     *
     * @param ledgerId ledger to look up
     * @return the entry kv or null if none found.
     */
    public EntryKeyValue getLastEntry(long ledgerId) throws IOException {
        EntryKey result = null;
        EntryKey key = new EntryKey(ledgerId, Long.MAX_VALUE);
        long startTimeNanos = MathUtils.nowInNano();
        boolean success = false;
        this.lock.readLock().lock();
        try {
            result = this.kvmap.floorKey(key);
            if (result == null || result.getLedgerId() != ledgerId) {
                result = this.snapshot.floorKey(key);
            }
            success = true;
        } finally {
            this.lock.readLock().unlock();
            if (success) {
                getEntryStats.registerSuccessfulEvent(MathUtils.elapsedNanos(startTimeNanos),
                        TimeUnit.NANOSECONDS);
            } else {
                getEntryStats.registerFailedEvent(MathUtils.elapsedNanos(startTimeNanos),
                        TimeUnit.NANOSECONDS);
            }
        }

        if (result == null || result.getLedgerId() != ledgerId) {
            return null;
        }
        // Keys stored in the skip lists are the EntryKeyValue instances themselves.
        return (EntryKeyValue) result;
    }

    /**
     * Check if the entire heap usage for this EntryMemTable exceeds limit.
     *
     * @return true when the tracked size is at or above the configured skip list size limit
     */
    boolean isSizeLimitReached() {
        return size.get() >= skipListSizeLimit;
    }

    /**
     * Check if there is data in the mem-table.
     *
     * @return true when the tracked size is zero and the snapshot is empty
     */
    boolean isEmpty() {
        return size.get() == 0 && snapshot.isEmpty();
    }
}