CommitLog.java example

Explorer
apache-cassandra-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.cassandra.db.commitlog;

import java.io.*;
import java.util.*;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.zip.CRC32;
import java.util.zip.Checksum;

import org.apache.log4j.Logger;
import org.apache.commons.lang.StringUtils;

import org.apache.cassandra.concurrent.StageManager;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.db.ColumnFamily;
import org.apache.cassandra.db.ColumnFamilyStore;
import org.apache.cassandra.db.FSWriteError;
import org.apache.cassandra.db.IColumn;
import org.apache.cassandra.db.RowMutation;
import org.apache.cassandra.db.Table;
import org.apache.cassandra.db.commitlog.CommitLogSegment.CommitLogContext;
import org.apache.cassandra.io.DeletionService;
import org.apache.cassandra.io.util.BufferedRandomAccessFile;
import org.apache.cassandra.io.util.FileUtils;
import org.apache.cassandra.utils.FBUtilities;
import org.apache.cassandra.utils.WrappedRunnable;
import org.apache.cassandra.utils.CLibrary;
import org.xerial.snappy.Snappy;

/*
 * Commit Log tracks every write operation into the system. The aim
 * of the commit log is to be able to successfully recover data that was
 * not stored to disk via the Memtable. Every Commit Log maintains a
 * header represented by the abstraction CommitLogHeader. The header
 * contains a bit array and an array of longs and both the arrays are
 * of size, #column families for the Table, the Commit Log represents.
 *
 * Whenever a ColumnFamily is written to, for the first time its bit flag
 * is set to one in the CommitLogHeader. When it is flushed to disk by the
 * Memtable its corresponding bit in the header is set to zero. This helps
 * track which CommitLogs can be thrown away as a result of Memtable flushes.
 * Additionally, when a ColumnFamily is flushed and written to disk, its
 * entry in the array of longs is updated with the offset in the Commit Log
 * file where it was written. This helps speed up recovery since we can seek
 * to these offsets and start processing the commit log.
 *
 * Every Commit Log is rolled over everytime it reaches its threshold in size;
 * the new log inherits the "dirty" bits from the old.
 *
 * Over time there could be a number of commit logs that would be generated.
 * To allow cleaning up non-active commit logs, whenever we flush a column family and update its bit flag in
 * the active CL, we take the dirty bit array and bitwise & it with the headers of the older logs.
 * If the result is 0, then it is safe to remove the older file.  (Since the new CL
 * inherited the old's dirty bitflags, getting a zero for any given bit in the anding
 * means that either the CF was clean in the old CL or it has been flushed since the
 * switch in the new.)
 */
public class CommitLog
{
    private static final int MAX_OUTSTANDING_REPLAY_COUNT = 1024;
    private static volatile int SEGMENT_SIZE = 128*1024*1024; // roll after log gets this big

    /**
     * File extension for compressed commit logs
     */
    static String COMPRESSION_EXTENSION = ".z";

    private static final Logger logger = Logger.getLogger(CommitLog.class);

    public static CommitLog instance()
    {
        return CLHandle.instance;
    }

    private static class CLHandle
    {
        public static final CommitLog instance = new CommitLog();
    }

    private final Deque<CommitLogSegment> segments = new ArrayDeque<CommitLogSegment>();

    public static void setSegmentSize(int size)
    {
        SEGMENT_SIZE = size;
    }

    public int getSegmentCount()
    {
        return segments.size();
    }

    private final ICommitLogExecutorService executor;

    /**
     * param @ table - name of table for which we are maintaining
     *                 this commit log.
     * param @ recoverymode - is commit log being instantiated in
     *                        in recovery mode.
    */
    private CommitLog()
    {
        // all old segments are recovered and deleted before CommitLog is instantiated.
        // All we need to do is create a new one.
        int cfSize = Table.TableMetadata.getColumnFamilyCount();
        segments.add(new CommitLogSegment(cfSize));
        
        if (DatabaseDescriptor.getCommitLogSync() == DatabaseDescriptor.CommitLogSync.periodic)
        {
            executor = new PeriodicCommitLogExecutorService();
            final Callable syncer = new Callable()
            {
                public Object call() throws Exception
                {
                    sync();
                    return null;
                }
            };

            new Thread(new Runnable()
            {
                public void run()
                {
                    while (true)
                    {
                        try
                        {
                            executor.submit(syncer).get();
                            watchMaxCommitLogs();
                            Thread.sleep(DatabaseDescriptor.getCommitLogSyncPeriod());
                        }
                        catch (InterruptedException e)
                        {
                            throw new AssertionError(e);
                        }
                        catch (ExecutionException e)
                        {
                            throw new RuntimeException(e);
                        }
                    }
                }
            }, "PERIODIC-COMMIT-LOG-SYNCER").start();
        }
        else
        {
            executor = new BatchCommitLogExecutorService();
        }
    }

    public static void recover() throws IOException
    {
        recover(Long.MAX_VALUE);
    }
    
    public static void recover(long maxReplayTimestamp) throws IOException
    {
        String directory = DatabaseDescriptor.getLogFileLocation();
        File file = new File(directory);
        File[] files = file.listFiles(new FilenameFilter()
        {
            public boolean accept(File dir, String name)
            {
                return name.matches("CommitLog-\\d+\\.log(\\.z)?");
            }
        });
        if (files.length == 0)
            return;

        Arrays.sort(files, new FileUtils.FileComparator());
        logger.info("Replaying " + StringUtils.join(files, ", "));
        recover(files,false,maxReplayTimestamp);
        
        archiveLogFiles(files);
        
        logger.info("Log replay complete");
    }

    /**
     * This is forced version of log replay. It is needed when data files 
     * are not in sync with commit log files headers, e.g. when youre replaying
     * shipped logs using file copies restored from backups.
     * @param maxReplayTimestamp 
     * 
     * @throws IOException
     */
    public static void forcedRecover(long maxReplayTimestamp) throws IOException
    {
        String directory = DatabaseDescriptor.getLogFileLocation();
        File file = new File(directory);
        File[] files = file.listFiles(new FilenameFilter()
        {
            public boolean accept(File dir, String name)
            {
                // throw out anything that starts with dot.
                return !name.matches("\\..*");
            }
        });
        if (files.length == 0)
            return;

        Arrays.sort(files, new FileUtils.FileComparator());
        logger.info("Forced replay " + StringUtils.join(files, ", "));
        recover(files,true,maxReplayTimestamp);
        
        FileUtils.delete(files);
        logger.info("Forced Log replay complete");
    }
    
    public static void recover(File[] clogs, boolean forced) throws IOException
    {
        recover(clogs,forced,Long.MAX_VALUE);
    }
    
    public static void recover(File[] clogs, boolean forced, long maxReplayTimestamp) throws IOException
    {
        Set<Table> tablesRecovered = new HashSet<Table>();
        List<Future<?>> futures = new ArrayList<Future<?>>();
        REPLAYLOOP:
        for (File file : clogs)
        {
            // empty log file - just removing it
            if (file.length()==0)
            {
                continue;
            }
            
            int bufferSize = (int)Math.min(file.length(), 32 * 1024 * 1024);
            BufferedRandomAccessFile reader = new BufferedRandomAccessFile(file.getAbsolutePath(), "r", bufferSize);

            try
            {
                CommitLogHeader clHeader = null;
                int replayPosition = 0;
                if (!forced)
                {
                    String headerPath = CommitLogHeader.getHeaderPathFromSegmentPath(file.getAbsolutePath());
                    try
                    {
                        clHeader = CommitLogHeader.readCommitLogHeader(headerPath);
                        replayPosition = CommitLogHeader.getLowestPosition(clHeader);
                    }
                    catch (IOException ioe)
                    {
                        logger.info(headerPath + " incomplete, missing or corrupt.  Everything is ok, don't panic.  CommitLog will be replayed from the beginning");
                        logger.debug("exception was", ioe);
                    }
                    
                    if (replayPosition < 0 || replayPosition > reader.length())
                    {
                        // replayPosition > reader.length() can happen if some data gets flushed before it is written to the commitlog
                        // (see https://issues.apache.org/jira/browse/CASSANDRA-2285)
                        logger.debug("skipping replay of fully-flushed "+ file);
                        continue;
                    }
                }

                /* seek to the lowest position where any CF has non-flushed data 
                 * if replay was forced - reading all records, regardless of commit log header 
                 */
                reader.seek(replayPosition);
                if (logger.isDebugEnabled())
                    logger.debug("Replaying " + file + " starting at " + replayPosition);

                if (forced)
                    logger.info("Replaying " + file + " starting at " + replayPosition);

                // assume file is compressed, if it has the special extension
                boolean logFileCompression = file.getName().endsWith(COMPRESSION_EXTENSION);
                if (logFileCompression && logger.isDebugEnabled())
                    logger.debug("Filename: " + file + " ends with \"" + COMPRESSION_EXTENSION + "\", expecting compression");

                /* read the logs populate RowMutation and apply */
                while (!reader.isEOF())
                {
                    if (logger.isDebugEnabled())
                        logger.debug("Reading mutation at " + reader.getFilePointer());

                    long claimedCRC32;
                    byte[] bytes;
                    try
                    {
                        long length = reader.readLong();
                        // RowMutation must be at LEAST 10 bytes:
                        // 3 each for a non-empty Table and Key (including the 2-byte length from writeUTF), 4 bytes for column count.
                        // This prevents CRC by being fooled by special-case garbage in the file; see CASSANDRA-2128
                        if (length < 10 || length > Integer.MAX_VALUE)
                            break;
                        bytes = new byte[(int) length]; // readlong can throw EOFException too
                        reader.readFully(bytes);
                        claimedCRC32 = reader.readLong();
                    }
                    catch (EOFException e)
                    {
                        // last CL entry didn't get completely written.  that's ok.
                        break;
                    }

                    Checksum checksum = new CRC32();
                    checksum.update(bytes, 0, bytes.length);
                    if (claimedCRC32 != checksum.getValue())
                    {
                        // this part of the log must not have been fsynced.  probably the rest is bad too,
                        // but just in case there is no harm in trying them.
                        continue;
                    }

                    // apply decompression
                    if (logFileCompression) {
                        bytes = Snappy.uncompress(bytes);
                    }

                    ByteArrayInputStream bufIn = new ByteArrayInputStream(bytes);
                    
                    /* deserialize the commit log entry */
                    final RowMutation rm = RowMutation.serializer().deserialize(new DataInputStream(bufIn));
                    
                    if (maxReplayTimestamp<Long.MAX_VALUE)
                    {
                        // inspecting mutation if it has any column with timestamp value greater than max
                        boolean timestampLimitReached=false;
                        for (ColumnFamily cf : rm.getColumnFamilies()) 
                        {
                            if (cf.isMarkedForDelete() && cf.getMarkedForDeleteAt()>maxReplayTimestamp)
                            {
                                timestampLimitReached = true;
                                break;
                            }
                            
                            for (IColumn c : cf.getSortedColumns())
                            {
                                if ( (c.isMarkedForDelete() && c.getMarkedForDeleteAt()>maxReplayTimestamp) || c.timestamp()>maxReplayTimestamp)
                                {
                                    timestampLimitReached = true;
                                    break;
                                }
                            }
                        }
                        
                        if (timestampLimitReached)
                        {
                            logger.info("Stopped replay at "+file+", position "+reader.getFilePointer()+" - mutation "+rm+" has timestamp >"+maxReplayTimestamp);
                            break REPLAYLOOP;
                        }
                        
                    }
                    
                    if (logger.isDebugEnabled())
                        logger.debug(String.format("replaying mutation for %s.%s: %s",
                                                    rm.getTable(),
                                                    rm.key(),
                                                    "{" + StringUtils.join(rm.getColumnFamilies(), ", ") + "}"));
                    final Table table = Table.open(rm.getTable());
                    tablesRecovered.add(table);
                    final Collection<ColumnFamily> columnFamilies = new ArrayList<ColumnFamily>(rm.getColumnFamilies());
                    final long entryLocation = reader.getFilePointer();
                    Runnable runnable;

                    if (forced || clHeader == null)
                    {
                        runnable = new WrappedRunnable()
                        {
                            public void runMayThrow() throws IOException
                            {
                                if (!rm.isEmpty())
                                {
                                    Table.open(rm.getTable()).apply(rm, null, false);
                                }
                            }
                        };
                    } else {
                        final CommitLogHeader finalHeader = clHeader;
                        runnable = new WrappedRunnable()
                        {
                            public void runMayThrow() throws IOException
                            {
                                /* remove column families that have already been flushed before applying the rest */
                                for (ColumnFamily columnFamily : columnFamilies)
                                {
                                    int id = table.getColumnFamilyId(columnFamily.name());
                                    if (!finalHeader.isDirty(id) || entryLocation <= finalHeader.getPosition(id))
                                    {
                                        rm.removeColumnFamily(columnFamily);
                                    }
                                }
                                if (!rm.isEmpty())
                                {
                                    Table.open(rm.getTable()).apply(rm, null, false);
                                }
                            }
                        };
                    }
                    futures.add(StageManager.getStage(StageManager.MUTATION_STAGE).submit(runnable));
                    if (futures.size() > MAX_OUTSTANDING_REPLAY_COUNT)
                    {
                        FBUtilities.waitOnFutures(futures);
                        futures.clear();
                    }
                }
            }
            finally
            {
                reader.close();
                logger.info("Finished reading " + file);
            }
        }


        // wait for all the writes to finish on the mutation stage
        FBUtilities.waitOnFutures(futures);
        logger.debug("Finished waiting on mutations from recovery");

        // flush replayed tables
        futures.clear();
        for (Table table : tablesRecovered)
            futures.addAll(table.flush());
        FBUtilities.waitOnFutures(futures);
        logger.info("Recovery complete");
    }

    private CommitLogSegment currentSegment()
    {
        return segments.getLast();
    }
    
    public Future<CommitLogContext> getContext() 
    {
        Callable<CommitLogSegment.CommitLogContext> task = new Callable<CommitLogSegment.CommitLogContext>()
        {
            public CommitLogSegment.CommitLogContext call() throws Exception
            {
                return currentSegment().getContext();
            }
        };
//        try
//        {
            return executor.submit(task);
//        }
//        catch (InterruptedException e)
//        {
//            throw new RuntimeException(e);
//        }
//        catch (ExecutionException e)
//        {
//            throw new RuntimeException(e);
//        }
    }

    /*
     * Adds the specified row to the commit log. This method will reset the
     * file offset to what it is before the start of the operation in case
     * of any problems. This way we can assume that the subsequent commit log
     * entry will override the garbage left over by the previous write.
    */
    public void add(RowMutation rowMutation, Object serializedRow) 
    {
        executor.add(new LogRecordAdder(rowMutation, serializedRow));
    }

    /*
     * This is called on Memtable flush to add to the commit log
     * a token indicating that this column family has been flushed.
     * The bit flag associated with this column family is set in the
     * header and this is used to decide if the log file can be deleted.
    */
    public void discardCompletedSegments(final String tableName, final String cf, final CommitLogSegment.CommitLogContext context) 
    {
        Callable task = new Callable()
        {
            public Object call() throws IOException
            {
                int id = Table.open(tableName).getColumnFamilyId(cf);
                discardCompletedSegmentsInternal(context, id);
                return null;
            }
        };
        try
        {
            executor.submit(task).get();
        }
        catch (InterruptedException e)
        {
            throw new RuntimeException(e);
        }
        catch (ExecutionException e)
        {
            throw new RuntimeException(e);
        }
    }

    /**
     * Delete log segments whose contents have been turned into SSTables. NOT threadsafe.
     *
     * param @ context The commitLog context .
     * param @ id id of the columnFamily being flushed to disk.
     *
    */
    private void discardCompletedSegmentsInternal(CommitLogSegment.CommitLogContext context, int id) 
    {
        if (logger.isDebugEnabled())
            logger.debug("discard completed log segments for " + context + ", column family " + id + ". CFIDs are " + Table.TableMetadata.getColumnFamilyIDString());

        /*
         * Loop through all the commit log files in the history. Now process
         * all files that are older than the one in the context. For each of
         * these files the header needs to modified by resetting the dirty
         * bit corresponding to the flushed CF.
        */
        Iterator<CommitLogSegment> iter = segments.iterator();
        while (iter.hasNext())
        {
            CommitLogSegment segment = iter.next();
            CommitLogHeader header = segment.getHeader();
            if (segment.equals(context.getSegment())) // is this segment grabbed at memtable flush ?
            {
                // Only unmark this segment if there were not write since the
                // ReplayPosition was grabbed.
                if (header.turnOffIfNotWritten(id, context.position)) {
                    maybeDiscardSegment(iter,segment,header);
                }
                
                break;
            }
            
            if (header.isDirty(id)) {
                header.turnOff(id);
    
                maybeDiscardSegment(iter, segment, header);
            }
        }
        
    }

    /**
     * For use from tests.
     */
    public Future<?> watchMaxCommitLogs()
    {
        if (DatabaseDescriptor.getMaxCommitLogSegmentsActive()>0) {
            final Callable maxCommitLogsWatcher = new Callable()
            {
                public Object call() throws Exception
                {
                    watchMaxCommitLogsInternal();
                    return null;
                }
            };


            return executor.submit(maxCommitLogsWatcher);
        }
        
        return null;
    }

    private void watchMaxCommitLogsInternal()
    {
        if (DatabaseDescriptor.getMaxCommitLogSegmentsActive()>0 && segments.size()>DatabaseDescriptor.getMaxCommitLogSegmentsActive()) {
            // trying to flush memtables, which marked dirty the oldest open segment
            CommitLogSegment oldestSegment = segments.getFirst();
            
            int cfid = oldestSegment.getHeader().getFirstDirtyCFId();
            
            assert cfid >=0 : "Commit Log Segment is clear, but open: "+oldestSegment+", header: "+oldestSegment.getHeader();
            
            final ColumnFamilyStore columnFamilyStore = Table.getColumnFamilyStore(cfid);
            
            // only submit flush, is this store is not currently flushing
            if (columnFamilyStore.getMemtablesPendingFlush().isEmpty()) {
                logger.info("Current open commit log segment count "+segments.size()+">"+DatabaseDescriptor.getMaxCommitLogSegmentsActive()+". Forcing flush of "+columnFamilyStore.columnFamily_+" to close "+oldestSegment);

                new Thread(
                        new WrappedRunnable()
                        {

                            @Override
                            protected void runMayThrow() throws Exception
                            {
                                columnFamilyStore.forceFlush();
                            }
                        },
                        "Flush submit of "+columnFamilyStore).start();
            }
        }
    }

    private void maybeDiscardSegment(Iterator<CommitLogSegment> iter,
            CommitLogSegment segment, CommitLogHeader header)
    {
        if ( header.isSafeToDelete() && segment!=currentSegment() )
        {

            if (DatabaseDescriptor.isLogArchiveActive())
            {
                logger.info("Archiving obsolete commit log:" + segment);
                segment.close();

                archiveLogfile(segment.getPath());
            }
            else
            {
                logger.info("Discarding obsolete commit log:" + segment);
                segment.close();
            }
            
            segment.submitDelete();
         
            // usually this will be the first (remaining) segment, but not always, if segment A contains
            // writes to a CF that is unflushed but is followed by segment B whose CFs are all flushed.
            iter.remove();
        }
        else
        {
            if (logger.isDebugEnabled())
                logger.debug("Not safe to delete commit log " + segment + "; dirty is " + header.dirtyString());

            try {
                segment.writeHeader();
            } catch (IOException e) {
                throw new FSWriteError(e);
            }
        }
    }
    
    private static void archiveLogFiles(File[] files) throws IOException
    {
        if (DatabaseDescriptor.isLogArchiveActive())
        {
            for (File file : files) {
                if (file.length()>0)
                {
                    logger.info("Archiving obsolete commit log:" + file);
                    archiveLogfile(file.getPath());
                }
            }

        }
        
        // MM: we dont archive .headers, because for archived files all CFs are clean
        // and anyway we'll want to replay all records in archived log
        for (File f : files)
        {
            FileUtils.delete(CommitLogHeader.getHeaderPathFromSegmentPath(f.getAbsolutePath())); // may not actually exist
            if (!f.delete())
                logger.error("Unable to remove " + f + "; you should remove it manually or next restart will replay it again (harmless, but time-consuming)");
        }

    }

    /**
     * @param oldLogFile
     */
    private static void archiveLogfile(String oldLogFile)
    {
        try {
            int lastSlash=oldLogFile.lastIndexOf( File.separator );
            String archivePath = DatabaseDescriptor.getLogArchiveDestination() + oldLogFile.substring(lastSlash);
            
            CLibrary.createHardLink(
                    new File( oldLogFile ),
                    new File( archivePath )
            );
            
            logger.info("Log file archived "+archivePath);
            
        } catch (IOException e) 
        {
            logger.warn("Cannot make hard link to "+oldLogFile, e);
        }
    }

    void sync() 
    {
        currentSegment().sync();
    }
    
    /**
     * for unit tests
     */
    public void resetUnsafe()
    {
        for (CommitLogSegment segment : segments)
            segment.close();
        segments.clear();
        int cfSize = Table.TableMetadata.getColumnFamilyCount();

        segments.add(new CommitLogSegment(cfSize));
    }

    // for tests mainly
    public int segmentsCount()
    {
        return segments.size();
    }


    public void forceNewSegment()
    {
        Callable task = new Callable()
        {
            public Object call() throws Exception
            {
                sync();
                segments.add(new CommitLogSegment(currentSegment().getHeader().getColumnFamilyCount()));
                return null;
            }
        };
        try
        {
            executor.submit(task).get();
        }
        catch (InterruptedException e)
        {
            throw new RuntimeException(e);
        }
        catch (ExecutionException e)
        {
            throw new RuntimeException(e);
        }
    }

    // TODO this should be a Runnable since it doesn't actually return anything, but it's difficult to do that
    // without breaking the fragile CheaterFutureTask in BatchCLES.
    class LogRecordAdder implements Callable, Runnable
    {
        final RowMutation rowMutation;
        final Object serializedRow;

        LogRecordAdder(RowMutation rm, Object serializedRow)
        {
            this.rowMutation = rm;
            this.serializedRow = serializedRow;
        }

        public void run()
        {
            try
            {
                currentSegment().write(rowMutation, serializedRow);
                // roll log if necessary
                if (currentSegment().length() >= SEGMENT_SIZE)
                {
                    sync();
                    segments.add(new CommitLogSegment(currentSegment().getHeader().getColumnFamilyCount()));
                }
            }
            catch (IOException e)
            {
                throw new FSWriteError(e);
            }
        }

        public Object call() throws Exception
        {
            run();
            return null;
        }
    }
}