/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.cassandra.db.commitlog; import java.io.File; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.HashSet; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Set; import java.util.UUID; import java.util.concurrent.BlockingQueue; import java.util.concurrent.Callable; import java.util.concurrent.ConcurrentLinkedQueue; import java.util.concurrent.Future; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicLong; import com.google.common.collect.Iterables; import com.google.common.util.concurrent.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.config.Schema; import org.apache.cassandra.db.ColumnFamilyStore; import org.apache.cassandra.db.Keyspace; import org.apache.cassandra.db.Mutation; import org.apache.cassandra.io.util.FileUtils; import org.apache.cassandra.net.MessagingService; import org.apache.cassandra.utils.Pair; import org.apache.cassandra.utils.concurrent.WaitQueue; import org.apache.cassandra.utils.JVMStabilityInspector; import org.apache.cassandra.utils.WrappedRunnable; import static org.apache.cassandra.db.commitlog.CommitLogSegment.Allocation; /** * Performs eager-creation of commit log segments in a background thread. All the * public methods are thread safe. */ public class CommitLogSegmentManager { static final Logger logger = LoggerFactory.getLogger(CommitLogSegmentManager.class); /** * Queue of work to be done by the manager thread. This is usually a recycle operation, which returns * a CommitLogSegment, or a delete operation, which returns null. */ private final BlockingQueue<Callable<CommitLogSegment>> segmentManagementTasks = new LinkedBlockingQueue<>(); /** Segments that are ready to be used. Head of the queue is the one we allocate writes to */ private final ConcurrentLinkedQueue<CommitLogSegment> availableSegments = new ConcurrentLinkedQueue<>(); /** Active segments, containing unflushed data */ private final ConcurrentLinkedQueue<CommitLogSegment> activeSegments = new ConcurrentLinkedQueue<>(); /** The segment we are currently allocating commit log records to */ private volatile CommitLogSegment allocatingFrom = null; private final WaitQueue hasAvailableSegments = new WaitQueue(); /** * Tracks commitlog size, in multiples of the segment size. We need to do this so we can "promise" size * adjustments ahead of actually adding/freeing segments on disk, so that the "evict oldest segment" logic * can see the effect of recycling segments immediately (even though they're really happening asynchronously * on the manager thread, which will take a ms or two). */ private final AtomicLong size = new AtomicLong(); /** * New segment creation is initially disabled because we'll typically get some "free" segments * recycled after log replay. */ private volatile boolean createReserveSegments = false; private final Thread managerThread; private volatile boolean run = true; public CommitLogSegmentManager() { // The run loop for the manager thread Runnable runnable = new WrappedRunnable() { public void runMayThrow() throws Exception { while (run) { try { Callable<CommitLogSegment> task = segmentManagementTasks.poll(); if (task == null) { // if we have no more work to do, check if we should create a new segment if (availableSegments.isEmpty() && (activeSegments.isEmpty() || createReserveSegments)) { logger.debug("No segments in reserve; creating a fresh one"); size.addAndGet(DatabaseDescriptor.getCommitLogSegmentSize()); // TODO : some error handling in case we fail to create a new segment availableSegments.add(CommitLogSegment.freshSegment()); hasAvailableSegments.signalAll(); } // flush old Cfs if we're full long unused = unusedCapacity(); if (unused < 0) { List<CommitLogSegment> segmentsToRecycle = new ArrayList<>(); long spaceToReclaim = 0; for (CommitLogSegment segment : activeSegments) { if (segment == allocatingFrom) break; segmentsToRecycle.add(segment); spaceToReclaim += DatabaseDescriptor.getCommitLogSegmentSize(); if (spaceToReclaim + unused >= 0) break; } flushDataFrom(segmentsToRecycle, false); } try { // wait for new work to be provided task = segmentManagementTasks.take(); } catch (InterruptedException e) { throw new AssertionError(); } } CommitLogSegment recycled = task.call(); if (recycled != null) { // if the work resulted in a segment to recycle, publish it availableSegments.add(recycled); hasAvailableSegments.signalAll(); } } catch (Throwable t) { JVMStabilityInspector.inspectThrowable(t); if (!CommitLog.handleCommitError("Failed managing commit log segments", t)) return; // sleep some arbitrary period to avoid spamming CL Uninterruptibles.sleepUninterruptibly(1, TimeUnit.SECONDS); } } } }; managerThread = new Thread(runnable, "COMMIT-LOG-ALLOCATOR"); managerThread.start(); } /** * Reserve space in the current segment for the provided mutation or, if there isn't space available, * create a new segment. * * @return the provided Allocation object */ public Allocation allocate(Mutation mutation, int size) { CommitLogSegment segment = allocatingFrom(); Allocation alloc; while ( null == (alloc = segment.allocate(mutation, size)) ) { // failed to allocate, so move to a new segment with enough room advanceAllocatingFrom(segment); segment = allocatingFrom; } return alloc; } // simple wrapper to ensure non-null value for allocatingFrom; only necessary on first call CommitLogSegment allocatingFrom() { CommitLogSegment r = allocatingFrom; if (r == null) { advanceAllocatingFrom(null); r = allocatingFrom; } return r; } /** * Fetches a new segment from the queue, creating a new one if necessary, and activates it */ private void advanceAllocatingFrom(CommitLogSegment old) { while (true) { CommitLogSegment next; synchronized (this) { // do this in a critical section so we can atomically remove from availableSegments and add to allocatingFrom/activeSegments // see https://issues.apache.org/jira/browse/CASSANDRA-6557?focusedCommentId=13874432&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-13874432 if (allocatingFrom != old) return; next = availableSegments.poll(); if (next != null) { allocatingFrom = next; activeSegments.add(next); } } if (next != null) { if (old != null) { // Now we can run the user defined command just after switching to the new commit log. // (Do this here instead of in the recycle call so we can get a head start on the archive.) CommitLog.instance.archiver.maybeArchive(old); // ensure we don't continue to use the old file; not strictly necessary, but cleaner to enforce it old.discardUnusedTail(); } // request that the CL be synced out-of-band, as we've finished a segment CommitLog.instance.requestExtraSync(); return; } // no more segments, so register to receive a signal when not empty WaitQueue.Signal signal = hasAvailableSegments.register(CommitLog.instance.metrics.waitingOnSegmentAllocation.time()); // trigger the management thread; this must occur after registering // the signal to ensure we are woken by any new segment creation wakeManager(); // check if the queue has already been added to before waiting on the signal, to catch modifications // that happened prior to registering the signal; *then* check to see if we've been beaten to making the change if (!availableSegments.isEmpty() || allocatingFrom != old) { signal.cancel(); // if we've been beaten, just stop immediately if (allocatingFrom != old) return; // otherwise try again, as there should be an available segment continue; } // can only reach here if the queue hasn't been inserted into // before we registered the signal, as we only remove items from the queue // after updating allocatingFrom. Can safely block until we are signalled // by the allocator that new segments have been published signal.awaitUninterruptibly(); } } private void wakeManager() { // put a NO-OP on the queue, to trigger management thread (and create a new segment if necessary) segmentManagementTasks.add(new Callable<CommitLogSegment>() { public CommitLogSegment call() { return null; } }); } /** * Switch to a new segment, regardless of how much is left in the current one. * * Flushes any dirty CFs for this segment and any older segments, and then recycles * the segments */ void forceRecycleAll(Iterable<UUID> droppedCfs) { List<CommitLogSegment> segmentsToRecycle = new ArrayList<>(activeSegments); CommitLogSegment last = segmentsToRecycle.get(segmentsToRecycle.size() - 1); advanceAllocatingFrom(last); // wait for the commit log modifications last.waitForModifications(); // make sure the writes have materialized inside of the memtables by waiting for all outstanding writes // on the relevant keyspaces to complete Set<Keyspace> keyspaces = new HashSet<>(); for (UUID cfId : last.getDirtyCFIDs()) { ColumnFamilyStore cfs = Schema.instance.getColumnFamilyStoreInstance(cfId); if (cfs != null) keyspaces.add(cfs.keyspace); } for (Keyspace keyspace : keyspaces) keyspace.writeOrder.awaitNewBarrier(); // flush and wait for all CFs that are dirty in segments up-to and including 'last' Future<?> future = flushDataFrom(segmentsToRecycle, true); try { future.get(); for (CommitLogSegment segment : activeSegments) for (UUID cfId : droppedCfs) segment.markClean(cfId, segment.getContext()); // now recycle segments that are unused, as we may not have triggered a discardCompletedSegments() // if the previous active segment was the only one to recycle (since an active segment isn't // necessarily dirty, and we only call dCS after a flush). for (CommitLogSegment segment : activeSegments) if (segment.isUnused()) recycleSegment(segment); CommitLogSegment first; if ((first = activeSegments.peek()) != null && first.id <= last.id) logger.error("Failed to force-recycle all segments; at least one segment is still in use with dirty CFs."); } catch (Throwable t) { // for now just log the error and return false, indicating that we failed logger.error("Failed waiting for a forced recycle of in-use commit log segments", t); } } /** * Indicates that a segment is no longer in use and that it should be recycled. * * @param segment segment that is no longer in use */ void recycleSegment(final CommitLogSegment segment) { boolean archiveSuccess = CommitLog.instance.archiver.maybeWaitForArchiving(segment.getName()); activeSegments.remove(segment); if (!archiveSuccess) { // if archiving (command) was not successful then leave the file alone. don't delete or recycle. discardSegment(segment, false); return; } if (isCapExceeded()) { discardSegment(segment, true); return; } logger.debug("Recycling {}", segment); segmentManagementTasks.add(new Callable<CommitLogSegment>() { public CommitLogSegment call() { return segment.recycle(); } }); } /** * Differs from the above because it can work on any file instead of just existing * commit log segments managed by this manager. * * @param file segment file that is no longer in use. */ void recycleSegment(final File file) { if (isCapExceeded() || CommitLogDescriptor.fromFileName(file.getName()).getMessagingVersion() != MessagingService.current_version) { // (don't decrease managed size, since this was never a "live" segment) logger.debug("(Unopened) segment {} is no longer needed and will be deleted now", file); FileUtils.deleteWithConfirm(file); return; } logger.debug("Recycling {}", file); // this wasn't previously a live segment, so add it to the managed size when we make it live size.addAndGet(DatabaseDescriptor.getCommitLogSegmentSize()); segmentManagementTasks.add(new Callable<CommitLogSegment>() { public CommitLogSegment call() { return new CommitLogSegment(file.getPath()); } }); } /** * Indicates that a segment file should be deleted. * * @param segment segment to be discarded */ private void discardSegment(final CommitLogSegment segment, final boolean deleteFile) { logger.debug("Segment {} is no longer active and will be deleted {}", segment, deleteFile ? "now" : "by the archive script"); size.addAndGet(-DatabaseDescriptor.getCommitLogSegmentSize()); segmentManagementTasks.add(new Callable<CommitLogSegment>() { public CommitLogSegment call() { segment.close(); if (deleteFile) segment.delete(); return null; } }); } /** * @return the space (in bytes) used by all segment files. */ public long bytesUsed() { return size.get(); } /** * @param name the filename to check * @return true if file is managed by this manager. */ public boolean manages(String name) { for (CommitLogSegment segment : Iterables.concat(activeSegments, availableSegments)) if (segment.getName().equals(name)) return true; return false; } /** * Check to see if the speculative current size exceeds the cap. * * @return true if cap is exceeded */ private boolean isCapExceeded() { return unusedCapacity() < 0; } private long unusedCapacity() { long currentSize = size.get(); logger.debug("Total active commitlog segment space used is {}", currentSize); return DatabaseDescriptor.getTotalCommitlogSpaceInMB() * 1024 * 1024 - currentSize; } /** * Throws a flag that enables the behavior of keeping at least one spare segment * available at all times. */ public void enableReserveSegmentCreation() { createReserveSegments = true; wakeManager(); } /** * Force a flush on all CFs that are still dirty in @param segments. * * @return a Future that will finish when all the flushes are complete. */ private Future<?> flushDataFrom(List<CommitLogSegment> segments, boolean force) { if (segments.isEmpty()) return Futures.immediateFuture(null); final ReplayPosition maxReplayPosition = segments.get(segments.size() - 1).getContext(); // a map of CfId -> forceFlush() to ensure we only queue one flush per cf final Map<UUID, ListenableFuture<?>> flushes = new LinkedHashMap<>(); for (CommitLogSegment segment : segments) { for (UUID dirtyCFId : segment.getDirtyCFIDs()) { Pair<String,String> pair = Schema.instance.getCF(dirtyCFId); if (pair == null) { // even though we remove the schema entry before a final flush when dropping a CF, // it's still possible for a writer to race and finish his append after the flush. logger.debug("Marking clean CF {} that doesn't exist anymore", dirtyCFId); segment.markClean(dirtyCFId, segment.getContext()); } else if (!flushes.containsKey(dirtyCFId)) { String keyspace = pair.left; final ColumnFamilyStore cfs = Keyspace.open(keyspace).getColumnFamilyStore(dirtyCFId); // can safely call forceFlush here as we will only ever block (briefly) for other attempts to flush, // no deadlock possibility since switchLock removal flushes.put(dirtyCFId, force ? cfs.forceFlush() : cfs.forceFlush(maxReplayPosition)); } } } return Futures.allAsList(flushes.values()); } /** * Resets all the segments, for testing purposes. DO NOT USE THIS OUTSIDE OF TESTS. */ public void resetUnsafe() { logger.debug("Closing and clearing existing commit log segments..."); while (!segmentManagementTasks.isEmpty()) Thread.yield(); for (CommitLogSegment segment : activeSegments) segment.close(); activeSegments.clear(); for (CommitLogSegment segment : availableSegments) segment.close(); availableSegments.clear(); allocatingFrom = null; } /** * Initiates the shutdown process for the management thread. */ public void shutdown() { run = false; segmentManagementTasks.add(Callables.<CommitLogSegment>returning(null)); } /** * Returns when the management thread terminates. */ public void awaitTermination() throws InterruptedException { managerThread.join(); } /** * @return a read-only collection of the active commit log segments */ Collection<CommitLogSegment> getActiveSegments() { return Collections.unmodifiableCollection(activeSegments); } }