/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.cassandra.db.commitlog;

import java.io.File;
import java.io.IOException;
import java.util.*;
import java.util.concurrent.Future;
import java.util.concurrent.atomic.AtomicInteger;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Predicate;
import com.google.common.collect.*;
import org.apache.commons.lang3.StringUtils;
import org.cliffc.high_scale_lib.NonBlockingHashSet;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.cassandra.concurrent.Stage;
import org.apache.cassandra.concurrent.StageManager;
import org.apache.cassandra.schema.TableId;
import org.apache.cassandra.schema.TableMetadataRef;
import org.apache.cassandra.config.Config;
import org.apache.cassandra.schema.Schema;
import org.apache.cassandra.schema.SchemaConstants;
import org.apache.cassandra.db.ColumnFamilyStore;
import org.apache.cassandra.db.Keyspace;
import org.apache.cassandra.db.Mutation;
import org.apache.cassandra.db.SystemKeyspace;
import org.apache.cassandra.db.partitions.PartitionUpdate;
import org.apache.cassandra.io.sstable.format.SSTableReader;
import org.apache.cassandra.utils.FBUtilities;
import org.apache.cassandra.utils.WrappedRunnable;

/**
 * Replays mutations from commit log segments on startup, skipping any updates
 * already persisted in sstables and applying the rest on the mutation stage.
 */
public class CommitLogReplayer implements CommitLogReadHandler
{
    @VisibleForTesting
    public static long MAX_OUTSTANDING_REPLAY_BYTES = Long.getLong("cassandra.commitlog_max_outstanding_replay_bytes", 1024 * 1024 * 64);
    @VisibleForTesting
    public static MutationInitiator mutationInitiator = new MutationInitiator();

    static final String IGNORE_REPLAY_ERRORS_PROPERTY = Config.PROPERTY_PREFIX + "commitlog.ignorereplayerrors";
    private static final Logger logger = LoggerFactory.getLogger(CommitLogReplayer.class);
    private static final int MAX_OUTSTANDING_REPLAY_COUNT = Integer.getInteger(Config.PROPERTY_PREFIX + "commitlog_max_outstanding_replay_count", 1024);

    private final Set<Keyspace> keyspacesReplayed;
    private final Queue<Future<Integer>> futures;

    private final AtomicInteger replayedCount;
    private final Map<TableId, IntervalSet<CommitLogPosition>> cfPersisted;
    private final CommitLogPosition globalPosition;

    // Used to throttle speed of replay of mutations if we pass the max outstanding count
    private long pendingMutationBytes = 0;

    private final ReplayFilter replayFilter;
    private final CommitLogArchiver archiver;

    @VisibleForTesting
    protected CommitLogReader commitLogReader;

    CommitLogReplayer(CommitLog commitLog,
                      CommitLogPosition globalPosition,
                      Map<TableId, IntervalSet<CommitLogPosition>> cfPersisted,
                      ReplayFilter replayFilter)
    {
        this.keyspacesReplayed = new NonBlockingHashSet<>();
        this.futures = new ArrayDeque<>();
        // Count the number of replayed mutations. We don't really care about atomicity, but we need it to be a reference.
        this.replayedCount = new AtomicInteger();
        this.cfPersisted = cfPersisted;
        this.globalPosition = globalPosition;
        this.replayFilter = replayFilter;
        this.archiver = commitLog.archiver;
        this.commitLogReader = new CommitLogReader();
    }
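    /*
     * Illustrative lifecycle (a sketch only; in practice CommitLog drives these
     * calls during startup recovery, and the variable names are hypothetical):
     *
     *   CommitLogReplayer replayer = CommitLogReplayer.construct(commitLog);
     *   replayer.replayFiles(segmentFiles);       // read segments, dispatch mutations
     *   int replayed = replayer.blockForWrites(); // wait for writes, flush keyspaces
     */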
    public static CommitLogReplayer construct(CommitLog commitLog)
    {
        // compute per-CF and global replay intervals
        Map<TableId, IntervalSet<CommitLogPosition>> cfPersisted = new HashMap<>();
        ReplayFilter replayFilter = ReplayFilter.create();
        for (ColumnFamilyStore cfs : ColumnFamilyStore.all())
        {
            // if we've truncated the cf in question, then we need to start replay after the truncation
            CommitLogPosition truncatedAt = SystemKeyspace.getTruncatedPosition(cfs.metadata.id);
            if (truncatedAt != null)
            {
                // Point in time restore is taken to mean that the tables need to be replayed even if they were
                // deleted at a later point in time. Any truncation record after that point must thus be cleared prior
                // to replay (CASSANDRA-9195).
                long restoreTime = commitLog.archiver.restorePointInTime;
                long truncatedTime = SystemKeyspace.getTruncatedAt(cfs.metadata.id);
                if (truncatedTime > restoreTime)
                {
                    if (replayFilter.includes(cfs.metadata))
                    {
                        logger.info("Restore point in time is before latest truncation of table {}.{}. Clearing truncation record.",
                                    cfs.metadata.keyspace,
                                    cfs.metadata.name);
                        SystemKeyspace.removeTruncationRecord(cfs.metadata.id);
                        truncatedAt = null;
                    }
                }
            }

            IntervalSet<CommitLogPosition> filter = persistedIntervals(cfs.getLiveSSTables(), truncatedAt);
            cfPersisted.put(cfs.metadata.id, filter);
        }

        CommitLogPosition globalPosition = firstNotCovered(cfPersisted.values());
        logger.debug("Global replay position is {} from columnfamilies {}", globalPosition, FBUtilities.toString(cfPersisted));
        return new CommitLogReplayer(commitLog, globalPosition, cfPersisted, replayFilter);
    }

    public void replayPath(File file, boolean tolerateTruncation) throws IOException
    {
        commitLogReader.readCommitLogSegment(this, file, globalPosition, CommitLogReader.ALL_MUTATIONS, tolerateTruncation);
    }

    public void replayFiles(File[] clogs) throws IOException
    {
        commitLogReader.readAllFiles(this, clogs, globalPosition);
    }
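    /*
     * Example (a sketch): ReplayFilter.create(), invoked from construct() above,
     * honors the cassandra.replayList system property to limit replay to specific
     * tables, e.g.
     *
     *   -Dcassandra.replayList=system.peers,ks1.tbl1
     *
     * Each entry must be a fully qualified keyspace.table name; ks1.tbl1 is
     * illustrative only.
     */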
    /**
     * Flushes all keyspaces associated with this replayer in parallel, blocking until their flushes are complete.
     *
     * @return the number of mutations replayed
     */
    public int blockForWrites()
    {
        for (Map.Entry<TableId, AtomicInteger> entry : commitLogReader.getInvalidMutations())
            logger.warn("Skipped {} mutations from unknown (probably removed) CF with id {}", entry.getValue(), entry.getKey());

        // wait for all the writes to finish on the mutation stage
        FBUtilities.waitOnFutures(futures);
        logger.trace("Finished waiting on mutations from recovery");

        // flush replayed keyspaces
        futures.clear();
        boolean flushingSystem = false;

        // use a separate list for the flush futures; the futures field tracks mutation writes
        List<Future<?>> flushFutures = new ArrayList<Future<?>>();
        for (Keyspace keyspace : keyspacesReplayed)
        {
            if (keyspace.getName().equals(SchemaConstants.SYSTEM_KEYSPACE_NAME))
                flushingSystem = true;

            flushFutures.addAll(keyspace.flush());
        }

        // also flush batchlog in case of any MV updates
        if (!flushingSystem)
            flushFutures.add(Keyspace.open(SchemaConstants.SYSTEM_KEYSPACE_NAME).getColumnFamilyStore(SystemKeyspace.BATCHES).forceFlush());

        FBUtilities.waitOnFutures(flushFutures);

        return replayedCount.get();
    }

    /*
     * Wrapper around initiating mutations read from the log to make it possible
     * to spy on initiated mutations for test
     */
    @VisibleForTesting
    public static class MutationInitiator
    {
        protected Future<Integer> initiateMutation(final Mutation mutation,
                                                   final long segmentId,
                                                   final int serializedSize,
                                                   final int entryLocation,
                                                   final CommitLogReplayer commitLogReplayer)
        {
            Runnable runnable = new WrappedRunnable()
            {
                public void runMayThrow()
                {
                    if (Schema.instance.getKeyspaceMetadata(mutation.getKeyspaceName()) == null)
                        return;
                    if (commitLogReplayer.pointInTimeExceeded(mutation))
                        return;

                    final Keyspace keyspace = Keyspace.open(mutation.getKeyspaceName());

                    // Rebuild the mutation, omitting column families that
                    //    a) the user has requested that we ignore,
                    //    b) have already been flushed,
                    // or c) are part of a cf that was dropped.
                    // Keep in mind that the cf.name() is suspect; do everything based on the cfid instead.
                    Mutation newMutation = null;
                    for (PartitionUpdate update : commitLogReplayer.replayFilter.filter(mutation))
                    {
                        if (Schema.instance.getTableMetadata(update.metadata().id) == null)
                            continue; // dropped

                        // replay if current segment is newer than last flushed one or,
                        // if it is the last known segment, if we are after the commit log segment position
                        if (commitLogReplayer.shouldReplay(update.metadata().id, new CommitLogPosition(segmentId, entryLocation)))
                        {
                            if (newMutation == null)
                                newMutation = new Mutation(mutation.getKeyspaceName(), mutation.key());
                            newMutation.add(update);
                            commitLogReplayer.replayedCount.incrementAndGet();
                        }
                    }
                    if (newMutation != null)
                    {
                        assert !newMutation.isEmpty();

                        Keyspace.open(newMutation.getKeyspaceName()).apply(newMutation, false, true, false);
                        commitLogReplayer.keyspacesReplayed.add(keyspace);
                    }
                }
            };
            return StageManager.getStage(Stage.MUTATION).submit(runnable, serializedSize);
        }
    }

    /**
     * A set of known safe-to-discard commit log replay positions, based on
     * the range covered by on-disk sstables and those prior to the most recent truncation record.
     */
    public static IntervalSet<CommitLogPosition> persistedIntervals(Iterable<SSTableReader> onDisk, CommitLogPosition truncatedAt)
    {
        IntervalSet.Builder<CommitLogPosition> builder = new IntervalSet.Builder<>();
        for (SSTableReader reader : onDisk)
            builder.addAll(reader.getSSTableMetadata().commitLogIntervals);

        if (truncatedAt != null)
            builder.add(CommitLogPosition.NONE, truncatedAt);
        return builder.build();
    }
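    /*
     * A sketch of how persistedIntervals() feeds shouldReplay(): if a table's
     * sstables record flushed intervals [NONE, p1] and [p2, p3] (hypothetical
     * positions), mutations whose positions fall inside those intervals are
     * already persisted and are skipped, while anything between the intervals
     * or after p3 is replayed.
     */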
    /**
     * Find the earliest commit log position that is not covered by the known flushed ranges for some table.
     *
     * For efficiency this assumes that the first contiguously flushed interval we know of contains the moment that the
     * given table was constructed* and hence we can start replay from the end of that interval.
     *
     * If such an interval is not known, we must replay from the beginning.
     *
     * * This assumption fails only if the very first flush of a table stalled or failed, while a second or later
     *   flush succeeded. The chances of this happening are at most very low, and if the assumption does prove to be
     *   incorrect during replay there is little chance that the affected deployment is in production.
     */
    public static CommitLogPosition firstNotCovered(Collection<IntervalSet<CommitLogPosition>> ranges)
    {
        return ranges.stream()
                     .map(intervals -> Iterables.getFirst(intervals.ends(), CommitLogPosition.NONE))
                     .min(Ordering.natural())
                     .get(); // iteration is per known-CF, there must be at least one.
    }

    abstract static class ReplayFilter
    {
        public abstract Iterable<PartitionUpdate> filter(Mutation mutation);

        public abstract boolean includes(TableMetadataRef metadata);

        public static ReplayFilter create()
        {
            // If no replaylist is supplied, replay everything.
            if (System.getProperty("cassandra.replayList") == null)
                return new AlwaysReplayFilter();

            Multimap<String, String> toReplay = HashMultimap.create();
            for (String rawPair : System.getProperty("cassandra.replayList").split(","))
            {
                String[] pair = StringUtils.split(rawPair.trim(), '.');
                if (pair.length != 2)
                    throw new IllegalArgumentException("Each table to be replayed must be fully qualified with keyspace name, e.g., 'system.peers'");

                Keyspace ks = Schema.instance.getKeyspaceInstance(pair[0]);
                if (ks == null)
                    throw new IllegalArgumentException("Unknown keyspace " + pair[0]);
                ColumnFamilyStore cfs = ks.getColumnFamilyStore(pair[1]);
                if (cfs == null)
                    throw new IllegalArgumentException(String.format("Unknown table %s.%s", pair[0], pair[1]));

                toReplay.put(pair[0], pair[1]);
            }
            return new CustomReplayFilter(toReplay);
        }
    }

    private static class AlwaysReplayFilter extends ReplayFilter
    {
        public Iterable<PartitionUpdate> filter(Mutation mutation)
        {
            return mutation.getPartitionUpdates();
        }

        public boolean includes(TableMetadataRef metadata)
        {
            return true;
        }
    }

    private static class CustomReplayFilter extends ReplayFilter
    {
        private Multimap<String, String> toReplay;

        public CustomReplayFilter(Multimap<String, String> toReplay)
        {
            this.toReplay = toReplay;
        }

        public Iterable<PartitionUpdate> filter(Mutation mutation)
        {
            final Collection<String> cfNames = toReplay.get(mutation.getKeyspaceName());
            if (cfNames == null)
                return Collections.emptySet();

            return Iterables.filter(mutation.getPartitionUpdates(), new Predicate<PartitionUpdate>()
            {
                public boolean apply(PartitionUpdate upd)
                {
                    return cfNames.contains(upd.metadata().name);
                }
            });
        }

        public boolean includes(TableMetadataRef metadata)
        {
            return toReplay.containsEntry(metadata.keyspace, metadata.name);
        }
    }

    /**
     * Consult the known-persisted ranges for our sstables;
     * if the position is covered by one of them it does not need to be replayed.
     *
     * @return true iff replay is necessary
     */
    private boolean shouldReplay(TableId tableId, CommitLogPosition position)
    {
        return !cfPersisted.get(tableId).contains(position);
    }

    protected boolean pointInTimeExceeded(Mutation fm)
    {
        long restoreTarget = archiver.restorePointInTime;

        for (PartitionUpdate upd : fm.getPartitionUpdates())
        {
            if (archiver.precision.toMillis(upd.maxTimestamp()) > restoreTarget)
                return true;
        }
        return false;
    }
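    /*
     * A note on the replay throttle used by handleMutation() below (a sketch):
     * both limits can be overridden with system properties at startup, e.g.
     *
     *   -Dcassandra.commitlog_max_outstanding_replay_count=2048
     *   -Dcassandra.commitlog_max_outstanding_replay_bytes=134217728
     *
     * The property names come from the field initializers at the top of this class
     * (assuming Config.PROPERTY_PREFIX resolves to "cassandra."); the values shown
     * are illustrative only.
     */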
    public void handleMutation(Mutation m, int size, int entryLocation, CommitLogDescriptor desc)
    {
        pendingMutationBytes += size;
        futures.offer(mutationInitiator.initiateMutation(m,
                                                         desc.id,
                                                         size,
                                                         entryLocation,
                                                         this));
        // If there are finished mutations, or too many outstanding bytes/mutations,
        // drain the futures in the queue
        while (futures.size() > MAX_OUTSTANDING_REPLAY_COUNT
               || pendingMutationBytes > MAX_OUTSTANDING_REPLAY_BYTES
               || (!futures.isEmpty() && futures.peek().isDone()))
        {
            pendingMutationBytes -= FBUtilities.waitOnFuture(futures.poll());
        }
    }

    public boolean shouldSkipSegmentOnError(CommitLogReadException exception) throws IOException
    {
        if (exception.permissible)
            logger.error("Ignoring commit log replay error likely due to incomplete flush to disk", exception);
        else if (Boolean.getBoolean(IGNORE_REPLAY_ERRORS_PROPERTY))
            logger.error("Ignoring commit log replay error", exception);
        else if (!CommitLog.handleCommitError("Failed commit log replay", exception))
        {
            logger.error("Replay stopped. If you wish to override this error and continue starting the node ignoring " +
                         "commit log replay problems, specify -D" + IGNORE_REPLAY_ERRORS_PROPERTY + "=true " +
                         "on the command line");
            throw new CommitLogReplayException(exception.getMessage(), exception);
        }
        return false;
    }

    /**
     * The replayer's logic for whether to throw on an error is identical for recoverable and unrecoverable errors.
     */
    public void handleUnrecoverableError(CommitLogReadException exception) throws IOException
    {
        // Don't care about the return value; use this simply to throw the exception as appropriate.
        shouldSkipSegmentOnError(exception);
    }

    @SuppressWarnings("serial")
    public static class CommitLogReplayException extends IOException
    {
        public CommitLogReplayException(String message, Throwable cause)
        {
            super(message, cause);
        }

        public CommitLogReplayException(String message)
        {
            super(message);
        }
    }
}