/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cassandra.db.commitlog;
import java.io.*;
import java.util.*;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.zip.CRC32;
import java.util.zip.Checksum;
import org.apache.log4j.Logger;
import org.apache.commons.lang.StringUtils;
import org.apache.cassandra.concurrent.StageManager;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.db.ColumnFamily;
import org.apache.cassandra.db.ColumnFamilyStore;
import org.apache.cassandra.db.FSWriteError;
import org.apache.cassandra.db.IColumn;
import org.apache.cassandra.db.RowMutation;
import org.apache.cassandra.db.Table;
import org.apache.cassandra.db.commitlog.CommitLogSegment.CommitLogContext;
import org.apache.cassandra.io.DeletionService;
import org.apache.cassandra.io.util.BufferedRandomAccessFile;
import org.apache.cassandra.io.util.FileUtils;
import org.apache.cassandra.utils.FBUtilities;
import org.apache.cassandra.utils.WrappedRunnable;
import org.apache.cassandra.utils.CLibrary;
import org.xerial.snappy.Snappy;
/*
 * The Commit Log tracks every write operation made to the system. The aim
 * of the commit log is to be able to successfully recover data that was
 * not stored to disk via the Memtable. Every commit log maintains a
 * header, represented by the abstraction CommitLogHeader. The header
 * contains a bit array and an array of longs, both sized to the number of
 * column families in the Table that the commit log represents.
 *
 * Whenever a ColumnFamily is written to for the first time, its bit flag
 * is set to one in the CommitLogHeader. When it is flushed to disk by the
 * Memtable, its corresponding bit in the header is set to zero. This helps
 * track which commit logs can be thrown away as a result of Memtable flushes.
 * Additionally, when a ColumnFamily is flushed and written to disk, its
 * entry in the array of longs is updated with the offset in the commit log
 * file where it was written. This helps speed up recovery, since we can seek
 * to these offsets and start processing the commit log from there.
 *
 * Every commit log is rolled over each time it reaches its size threshold;
 * the new log inherits the "dirty" bits from the old one.
 *
 * Over time a number of commit logs will be generated. To allow cleaning up
 * non-active commit logs, whenever we flush a column family and update its bit flag in
 * the active log, we take the dirty bit array and bitwise-AND it with the headers of the older logs.
 * If the result is 0, then it is safe to remove the older file. (Since the new log
 * inherited the old one's dirty bit flags, getting a zero for any given bit in the AND
 * means that either the CF was clean in the old log or it has been flushed since the
 * switch to the new one.)
 */
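/*
 * Worked example (illustrative numbers only): with two column families,
 * suppose an old log's header has dirty bits {CF0=1, CF1=1}; the new log
 * inherits {1, 1}. After CF0 flushes, the active header becomes {0, 1}, and
 * ANDing it into the old header leaves {0, 1} != 0, so the old log must be
 * kept (CF1 may still have unflushed data in it). Once CF1 also flushes,
 * the AND yields {0, 0} and the old log is safe to delete.
 */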
public class CommitLog
{
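    // cap on replayed mutations handed to the mutation stage but not yet
    // applied; bounds memory use during recovery (see recover())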
private static final int MAX_OUTSTANDING_REPLAY_COUNT = 1024;
private static volatile int SEGMENT_SIZE = 128*1024*1024; // roll after log gets this big
/**
* File extension for compressed commit logs
*/
static String COMPRESSION_EXTENSION = ".z";
private static final Logger logger = Logger.getLogger(CommitLog.class);
public static CommitLog instance()
{
return CLHandle.instance;
}
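    // initialization-on-demand holder idiom: the JVM guarantees lazy,
    // thread-safe construction of the singleton on first access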
private static class CLHandle
{
public static final CommitLog instance = new CommitLog();
}
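    // active segments, oldest first; the last element is the segment currently being written to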
private final Deque<CommitLogSegment> segments = new ArrayDeque<CommitLogSegment>();
public static void setSegmentSize(int size)
{
SEGMENT_SIZE = size;
}
public int getSegmentCount()
{
return segments.size();
}
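    // all commit log work (writes, syncs, rollovers, header updates) is
    // serialized through this executor; periodic or batch depending on the
    // configured commit log sync mode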
private final ICommitLogExecutorService executor;
    /**
     * Sets up the initial commit log segment and starts the sync service
     * (periodic or batch) configured for this node.
     */
private CommitLog()
{
// all old segments are recovered and deleted before CommitLog is instantiated.
// All we need to do is create a new one.
int cfSize = Table.TableMetadata.getColumnFamilyCount();
segments.add(new CommitLogSegment(cfSize));
if (DatabaseDescriptor.getCommitLogSync() == DatabaseDescriptor.CommitLogSync.periodic)
{
executor = new PeriodicCommitLogExecutorService();
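            // syncs are submitted through the executor so they serialize with
            // writes; the loop below runs one every commit log sync period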
final Callable syncer = new Callable()
{
public Object call() throws Exception
{
sync();
return null;
}
};
new Thread(new Runnable()
{
public void run()
{
while (true)
{
try
{
executor.submit(syncer).get();
watchMaxCommitLogs();
Thread.sleep(DatabaseDescriptor.getCommitLogSyncPeriod());
}
catch (InterruptedException e)
{
throw new AssertionError(e);
}
catch (ExecutionException e)
{
throw new RuntimeException(e);
}
}
}
}, "PERIODIC-COMMIT-LOG-SYNCER").start();
}
else
{
executor = new BatchCommitLogExecutorService();
}
}
public static void recover() throws IOException
{
recover(Long.MAX_VALUE);
}
public static void recover(long maxReplayTimestamp) throws IOException
{
String directory = DatabaseDescriptor.getLogFileLocation();
File file = new File(directory);
File[] files = file.listFiles(new FilenameFilter()
{
public boolean accept(File dir, String name)
{
return name.matches("CommitLog-\\d+\\.log(\\.z)?");
}
});
        if (files == null || files.length == 0)
            return;
Arrays.sort(files, new FileUtils.FileComparator());
logger.info("Replaying " + StringUtils.join(files, ", "));
        recover(files, false, maxReplayTimestamp);
archiveLogFiles(files);
logger.info("Log replay complete");
}
    /**
     * This is a forced version of log replay. It is needed when data files
     * are not in sync with commit log headers, e.g. when you're replaying
     * shipped logs using file copies restored from backups.
     *
     * @param maxReplayTimestamp replay stops at the first mutation carrying a timestamp greater than this value
     * @throws IOException
     */
public static void forcedRecover(long maxReplayTimestamp) throws IOException
{
String directory = DatabaseDescriptor.getLogFileLocation();
File file = new File(directory);
File[] files = file.listFiles(new FilenameFilter()
{
public boolean accept(File dir, String name)
{
// throw out anything that starts with dot.
return !name.matches("\\..*");
}
});
        if (files == null || files.length == 0)
            return;
Arrays.sort(files, new FileUtils.FileComparator());
logger.info("Forced replay " + StringUtils.join(files, ", "));
        recover(files, true, maxReplayTimestamp);
FileUtils.delete(files);
logger.info("Forced Log replay complete");
}
public static void recover(File[] clogs, boolean forced) throws IOException
{
        recover(clogs, forced, Long.MAX_VALUE);
}
public static void recover(File[] clogs, boolean forced, long maxReplayTimestamp) throws IOException
{
Set<Table> tablesRecovered = new HashSet<Table>();
List<Future<?>> futures = new ArrayList<Future<?>>();
REPLAYLOOP:
for (File file : clogs)
{
            // empty log file - nothing to replay, skip it
if (file.length()==0)
{
continue;
}
int bufferSize = (int)Math.min(file.length(), 32 * 1024 * 1024);
BufferedRandomAccessFile reader = new BufferedRandomAccessFile(file.getAbsolutePath(), "r", bufferSize);
try
{
CommitLogHeader clHeader = null;
int replayPosition = 0;
if (!forced)
{
String headerPath = CommitLogHeader.getHeaderPathFromSegmentPath(file.getAbsolutePath());
try
{
clHeader = CommitLogHeader.readCommitLogHeader(headerPath);
replayPosition = CommitLogHeader.getLowestPosition(clHeader);
}
catch (IOException ioe)
{
logger.info(headerPath + " incomplete, missing or corrupt. Everything is ok, don't panic. CommitLog will be replayed from the beginning");
logger.debug("exception was", ioe);
}
if (replayPosition < 0 || replayPosition > reader.length())
{
// replayPosition > reader.length() can happen if some data gets flushed before it is written to the commitlog
// (see https://issues.apache.org/jira/browse/CASSANDRA-2285)
logger.debug("skipping replay of fully-flushed "+ file);
continue;
}
}
                /* seek to the lowest position where any CF has non-flushed data;
                 * if replay was forced, read all records regardless of the commit log header
                 */
reader.seek(replayPosition);
if (logger.isDebugEnabled())
logger.debug("Replaying " + file + " starting at " + replayPosition);
if (forced)
logger.info("Replaying " + file + " starting at " + replayPosition);
// assume file is compressed, if it has the special extension
boolean logFileCompression = file.getName().endsWith(COMPRESSION_EXTENSION);
if (logFileCompression && logger.isDebugEnabled())
logger.debug("Filename: " + file + " ends with \"" + COMPRESSION_EXTENSION + "\", expecting compression");
                /* read the log, deserialize RowMutations and apply them */
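                // on-disk entry layout: an 8-byte length, `length` bytes of
                // serialized (and, for .z files, Snappy-compressed) mutation
                // data, then an 8-byte CRC32 computed over those stored bytes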
while (!reader.isEOF())
{
if (logger.isDebugEnabled())
logger.debug("Reading mutation at " + reader.getFilePointer());
long claimedCRC32;
byte[] bytes;
try
{
long length = reader.readLong();
                        // RowMutation must be at LEAST 10 bytes:
                        // 3 each for a non-empty Table and Key (including the 2-byte length from writeUTF),
                        // 4 bytes for column count.
                        // This prevents the CRC from being fooled by special-case garbage in the file; see CASSANDRA-2128
if (length < 10 || length > Integer.MAX_VALUE)
break;
bytes = new byte[(int) length]; // readlong can throw EOFException too
reader.readFully(bytes);
claimedCRC32 = reader.readLong();
}
catch (EOFException e)
{
// last CL entry didn't get completely written. that's ok.
break;
}
Checksum checksum = new CRC32();
checksum.update(bytes, 0, bytes.length);
if (claimedCRC32 != checksum.getValue())
{
                        // this part of the log must not have been fsynced. probably the rest
                        // is bad too, but there is no harm in trying the remaining entries.
continue;
}
// apply decompression
                    if (logFileCompression)
                    {
                        bytes = Snappy.uncompress(bytes);
                    }
ByteArrayInputStream bufIn = new ByteArrayInputStream(bytes);
/* deserialize the commit log entry */
final RowMutation rm = RowMutation.serializer().deserialize(new DataInputStream(bufIn));
                    if (maxReplayTimestamp < Long.MAX_VALUE)
                    {
                        // check whether the mutation contains any column or tombstone
                        // with a timestamp greater than the replay limit
                        boolean timestampLimitReached = false;
for (ColumnFamily cf : rm.getColumnFamilies())
{
if (cf.isMarkedForDelete() && cf.getMarkedForDeleteAt()>maxReplayTimestamp)
{
timestampLimitReached = true;
break;
}
for (IColumn c : cf.getSortedColumns())
{
if ( (c.isMarkedForDelete() && c.getMarkedForDeleteAt()>maxReplayTimestamp) || c.timestamp()>maxReplayTimestamp)
{
timestampLimitReached = true;
break;
}
}
}
if (timestampLimitReached)
{
logger.info("Stopped replay at "+file+", position "+reader.getFilePointer()+" - mutation "+rm+" has timestamp >"+maxReplayTimestamp);
break REPLAYLOOP;
}
}
if (logger.isDebugEnabled())
logger.debug(String.format("replaying mutation for %s.%s: %s",
rm.getTable(),
rm.key(),
"{" + StringUtils.join(rm.getColumnFamilies(), ", ") + "}"));
final Table table = Table.open(rm.getTable());
tablesRecovered.add(table);
final Collection<ColumnFamily> columnFamilies = new ArrayList<ColumnFamily>(rm.getColumnFamilies());
final long entryLocation = reader.getFilePointer();
Runnable runnable;
if (forced || clHeader == null)
{
runnable = new WrappedRunnable()
{
public void runMayThrow() throws IOException
{
if (!rm.isEmpty())
{
Table.open(rm.getTable()).apply(rm, null, false);
}
}
};
                    }
                    else
                    {
final CommitLogHeader finalHeader = clHeader;
runnable = new WrappedRunnable()
{
public void runMayThrow() throws IOException
{
/* remove column families that have already been flushed before applying the rest */
for (ColumnFamily columnFamily : columnFamilies)
{
int id = table.getColumnFamilyId(columnFamily.name());
if (!finalHeader.isDirty(id) || entryLocation <= finalHeader.getPosition(id))
{
rm.removeColumnFamily(columnFamily);
}
}
if (!rm.isEmpty())
{
Table.open(rm.getTable()).apply(rm, null, false);
}
}
};
}
futures.add(StageManager.getStage(StageManager.MUTATION_STAGE).submit(runnable));
if (futures.size() > MAX_OUTSTANDING_REPLAY_COUNT)
{
FBUtilities.waitOnFutures(futures);
futures.clear();
}
}
}
finally
{
reader.close();
logger.info("Finished reading " + file);
}
}
// wait for all the writes to finish on the mutation stage
FBUtilities.waitOnFutures(futures);
logger.debug("Finished waiting on mutations from recovery");
// flush replayed tables
futures.clear();
for (Table table : tablesRecovered)
futures.addAll(table.flush());
FBUtilities.waitOnFutures(futures);
logger.info("Recovery complete");
}
private CommitLogSegment currentSegment()
{
return segments.getLast();
}
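    /**
     * Returns the current write position of the active segment, computed on
     * the commit log executor so it is consistent with in-flight writes.
     */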
public Future<CommitLogContext> getContext()
{
Callable<CommitLogSegment.CommitLogContext> task = new Callable<CommitLogSegment.CommitLogContext>()
{
public CommitLogSegment.CommitLogContext call() throws Exception
{
return currentSegment().getContext();
}
};
        return executor.submit(task);
}
    /*
     * Adds the specified row to the commit log. This method will reset the
     * file offset to what it was before the start of the operation in case
     * of any problems. This way we can assume that the subsequent commit log
     * entry will overwrite the garbage left over by the previous write.
     */
public void add(RowMutation rowMutation, Object serializedRow)
{
executor.add(new LogRecordAdder(rowMutation, serializedRow));
}
/*
* This is called on Memtable flush to add to the commit log
* a token indicating that this column family has been flushed.
* The bit flag associated with this column family is set in the
* header and this is used to decide if the log file can be deleted.
*/
public void discardCompletedSegments(final String tableName, final String cf, final CommitLogSegment.CommitLogContext context)
{
Callable task = new Callable()
{
public Object call() throws IOException
{
int id = Table.open(tableName).getColumnFamilyId(cf);
discardCompletedSegmentsInternal(context, id);
return null;
}
};
try
{
executor.submit(task).get();
}
catch (InterruptedException e)
{
throw new RuntimeException(e);
}
catch (ExecutionException e)
{
throw new RuntimeException(e);
}
}
    /**
     * Delete log segments whose contents have been turned into SSTables. NOT threadsafe.
     *
     * @param context the commit log context grabbed at Memtable flush
     * @param id id of the columnFamily being flushed to disk
     */
private void discardCompletedSegmentsInternal(CommitLogSegment.CommitLogContext context, int id)
{
if (logger.isDebugEnabled())
logger.debug("discard completed log segments for " + context + ", column family " + id + ". CFIDs are " + Table.TableMetadata.getColumnFamilyIDString());
        /*
         * Loop through all the commit log files in the history. Process
         * all files that are older than the one in the context. For each of
         * these files the header needs to be modified by resetting the dirty
         * bit corresponding to the flushed CF.
         */
Iterator<CommitLogSegment> iter = segments.iterator();
while (iter.hasNext())
{
CommitLogSegment segment = iter.next();
CommitLogHeader header = segment.getHeader();
            if (segment.equals(context.getSegment())) // is this the segment grabbed at memtable flush?
            {
                // only unmark this segment if there have been no writes since the
                // ReplayPosition was grabbed
                if (header.turnOffIfNotWritten(id, context.position))
                {
                    maybeDiscardSegment(iter, segment, header);
                }
                break;
            }
            if (header.isDirty(id))
            {
                header.turnOff(id);
                maybeDiscardSegment(iter, segment, header);
            }
}
}
    /**
     * If more than the configured maximum number of commit log segments are
     * open, schedules a flush of the CF keeping the oldest segment dirty.
     * Invoked from the periodic syncer thread; also used from tests.
     */
public Future<?> watchMaxCommitLogs()
{
        if (DatabaseDescriptor.getMaxCommitLogSegmentsActive() > 0)
        {
final Callable maxCommitLogsWatcher = new Callable()
{
public Object call() throws Exception
{
watchMaxCommitLogsInternal();
return null;
}
};
return executor.submit(maxCommitLogsWatcher);
}
return null;
}
private void watchMaxCommitLogsInternal()
{
        if (DatabaseDescriptor.getMaxCommitLogSegmentsActive() > 0 && segments.size() > DatabaseDescriptor.getMaxCommitLogSegmentsActive())
        {
            // force a flush of the memtable that is keeping the oldest open segment dirty
            CommitLogSegment oldestSegment = segments.getFirst();
            int cfid = oldestSegment.getHeader().getFirstDirtyCFId();
            assert cfid >= 0 : "Commit Log Segment is clean, but open: " + oldestSegment + ", header: " + oldestSegment.getHeader();
            final ColumnFamilyStore columnFamilyStore = Table.getColumnFamilyStore(cfid);
            // only submit a flush if this store is not already flushing
            if (columnFamilyStore.getMemtablesPendingFlush().isEmpty())
            {
                logger.info("Current open commit log segment count " + segments.size() + " > " + DatabaseDescriptor.getMaxCommitLogSegmentsActive() + ". Forcing flush of " + columnFamilyStore.columnFamily_ + " to close " + oldestSegment);
new Thread(
new WrappedRunnable()
{
@Override
protected void runMayThrow() throws Exception
{
columnFamilyStore.forceFlush();
}
},
"Flush submit of "+columnFamilyStore).start();
}
}
}
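    /**
     * Deletes the given segment (archiving it first if log archiving is
     * active) once its header shows every CF flushed and it is no longer the
     * active segment; otherwise persists the updated header so the flushed
     * state survives a restart.
     */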
private void maybeDiscardSegment(Iterator<CommitLogSegment> iter,
CommitLogSegment segment, CommitLogHeader header)
{
        if (header.isSafeToDelete() && segment != currentSegment())
        {
            if (DatabaseDescriptor.isLogArchiveActive())
            {
                logger.info("Archiving obsolete commit log: " + segment);
                segment.close();
                archiveLogfile(segment.getPath());
            }
            else
            {
                logger.info("Discarding obsolete commit log: " + segment);
                segment.close();
            }
segment.submitDelete();
// usually this will be the first (remaining) segment, but not always, if segment A contains
// writes to a CF that is unflushed but is followed by segment B whose CFs are all flushed.
iter.remove();
}
else
{
if (logger.isDebugEnabled())
logger.debug("Not safe to delete commit log " + segment + "; dirty is " + header.dirtyString());
            try
            {
                segment.writeHeader();
            }
            catch (IOException e)
            {
                throw new FSWriteError(e);
            }
}
}
private static void archiveLogFiles(File[] files) throws IOException
{
if (DatabaseDescriptor.isLogArchiveActive())
{
            for (File file : files)
            {
                if (file.length() > 0)
                {
                    logger.info("Archiving obsolete commit log: " + file);
                    archiveLogfile(file.getPath());
                }
            }
}
        // MM: we don't archive .header files, because for archived files all CFs are clean
        // and we'll want to replay all records in an archived log anyway
for (File f : files)
{
FileUtils.delete(CommitLogHeader.getHeaderPathFromSegmentPath(f.getAbsolutePath())); // may not actually exist
if (!f.delete())
logger.error("Unable to remove " + f + "; you should remove it manually or next restart will replay it again (harmless, but time-consuming)");
}
}
    /**
     * Archives the given commit log file by creating a hard link to it in the
     * configured log archive destination.
     *
     * @param oldLogFile path of the commit log file to archive
     */
private static void archiveLogfile(String oldLogFile)
{
        try
        {
            int lastSlash = oldLogFile.lastIndexOf(File.separator);
            String archivePath = DatabaseDescriptor.getLogArchiveDestination() + oldLogFile.substring(lastSlash);
            CLibrary.createHardLink(new File(oldLogFile), new File(archivePath));
            logger.info("Log file archived " + archivePath);
        }
        catch (IOException e)
        {
            logger.warn("Cannot make hard link to " + oldLogFile, e);
        }
}
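    /**
     * Forces the data in the active segment to disk.
     */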
void sync()
{
currentSegment().sync();
}
/**
* for unit tests
*/
public void resetUnsafe()
{
for (CommitLogSegment segment : segments)
segment.close();
segments.clear();
int cfSize = Table.TableMetadata.getColumnFamilyCount();
segments.add(new CommitLogSegment(cfSize));
}
// for tests mainly
public int segmentsCount()
{
return segments.size();
}
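    /**
     * Syncs the active segment and rolls over to a fresh one. The roll is
     * performed on the commit log executor so it cannot interleave with
     * in-flight writes.
     */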
public void forceNewSegment()
{
Callable task = new Callable()
{
public Object call() throws Exception
{
sync();
segments.add(new CommitLogSegment(currentSegment().getHeader().getColumnFamilyCount()));
return null;
}
};
try
{
executor.submit(task).get();
}
catch (InterruptedException e)
{
throw new RuntimeException(e);
}
catch (ExecutionException e)
{
throw new RuntimeException(e);
}
}
// TODO this should be a Runnable since it doesn't actually return anything, but it's difficult to do that
// without breaking the fragile CheaterFutureTask in BatchCLES.
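    /**
     * Appends one serialized mutation to the active segment, rolling over to
     * a new segment once the current one reaches SEGMENT_SIZE.
     */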
class LogRecordAdder implements Callable, Runnable
{
final RowMutation rowMutation;
final Object serializedRow;
LogRecordAdder(RowMutation rm, Object serializedRow)
{
this.rowMutation = rm;
this.serializedRow = serializedRow;
}
public void run()
{
try
{
currentSegment().write(rowMutation, serializedRow);
// roll log if necessary
if (currentSegment().length() >= SEGMENT_SIZE)
{
sync();
segments.add(new CommitLogSegment(currentSegment().getHeader().getColumnFamilyCount()));
}
}
catch (IOException e)
{
throw new FSWriteError(e);
}
}
public Object call() throws Exception
{
run();
return null;
}
}
}