/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.cassandra.db.commitlog;

import java.io.*;
import java.util.*;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.zip.CRC32;
import java.util.zip.Checksum;

import com.google.common.collect.Iterables;
import com.google.common.collect.Ordering;

import org.apache.cassandra.db.*;
import org.apache.cassandra.io.sstable.SSTable;
import org.apache.cassandra.net.MessagingService;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.cassandra.concurrent.Stage;
import org.apache.cassandra.concurrent.StageManager;
import org.apache.cassandra.config.CFMetaData;
import org.apache.cassandra.config.Config;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.io.DeletionService;
import org.apache.cassandra.io.util.BufferedRandomAccessFile;
import org.apache.cassandra.io.util.FileUtils;
import org.apache.cassandra.utils.ByteBufferUtil;
import org.apache.cassandra.utils.FBUtilities;
import org.apache.cassandra.utils.WrappedRunnable;

/*
 * The Commit Log tracks every write operation in the system. Its aim is to
 * allow recovery of data that had not yet been stored to disk via the
 * Memtable. Every Commit Log maintains a header, represented by the
 * CommitLogHeader abstraction. The header contains a bit array and an array
 * of longs, each with one entry per column family of the Table the Commit
 * Log represents.
 *
 * Whenever a ColumnFamily is written to for the first time, its bit flag in
 * the CommitLogHeader is set to one. When it is flushed to disk by the
 * Memtable, its corresponding bit in the header is set back to zero. This
 * tracks which CommitLogs can be thrown away as a result of Memtable flushes.
 * Additionally, when a ColumnFamily is flushed and written to disk, its entry
 * in the array of longs is updated with the offset in the Commit Log file
 * where it was written. This speeds up recovery, since we can seek to these
 * offsets and start processing the commit log from there.
 *
 * Every Commit Log is rolled over each time it reaches its size threshold;
 * the new log inherits the "dirty" bits from the old one.
 *
 * Over time a number of commit logs will be generated. To allow cleaning up
 * non-active commit logs, whenever we flush a column family and update its
 * bit flag in the active CL, we take the dirty bit array and bitwise AND it
 * with the headers of the older logs. If the result is 0, it is safe to
 * remove the older file.
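 * For example, if an old segment's header shows two CFs dirty (0110) and,
 * after a flush, the active segment's dirty bits are 0100, the AND is 0100:
 * one CF is still unflushed, so the old segment must be kept. Once that CF
 * is flushed as well, the AND drops to 0000 and the file can be removed.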
 * (Since the new CL inherited the old one's dirty bitflags, getting a zero
 * for any given bit in the AND means that either the CF was clean in the old
 * CL or it has been flushed since the switch to the new one.)
 */
public class CommitLog
{
    private static final int MAX_OUTSTANDING_REPLAY_COUNT = 1024;

    static final Logger logger = LoggerFactory.getLogger(CommitLog.class);

    public static final CommitLog instance = new CommitLog();

    private final Deque<CommitLogSegment> segments = new ArrayDeque<CommitLogSegment>();

    private final ICommitLogExecutorService executor;

    private volatile int segmentSize = 128 * 1024 * 1024; // roll after log gets this big

    /**
     * Creates the commit log: ensures all data directories exist, reads the
     * configured segment size, opens an initial active segment, and selects
     * the executor implementing the configured sync strategy (batch or periodic).
     */
    private CommitLog()
    {
        try
        {
            DatabaseDescriptor.createAllDirectories();
            segmentSize = DatabaseDescriptor.getCommitLogSegmentSize();
        }
        catch (IOException e)
        {
            throw new IOError(e);
        }

        // all old segments are recovered and deleted before CommitLog is instantiated.
        // All we need to do is create a new one.
        segments.add(new CommitLogSegment());

        executor = DatabaseDescriptor.getCommitLogSync() == Config.CommitLogSync.batch
                 ? new BatchCommitLogExecutorService()
                 : new PeriodicCommitLogExecutorService(this);
    }

    public void resetUnsafe()
    {
        segments.clear();
        segments.add(new CommitLogSegment());
    }

    private boolean manages(String name)
    {
        for (CommitLogSegment segment : segments)
        {
            if (segment.getPath().endsWith(name))
                return true;
        }
        return false;
    }

    public static int recover() throws IOException
    {
        String directory = DatabaseDescriptor.getCommitLogLocation();
        File[] files = new File(directory).listFiles(new FilenameFilter()
        {
            public boolean accept(File dir, String name)
            {
                // we used to try to avoid instantiating commitlog (thus creating an empty segment ready for writes)
                // until after recover was finished.  this turns out to be fragile; it is less error-prone to go
                // ahead and allow writes before recover(), and just skip active segments when we do.
                return CommitLogSegment.possibleCommitLogFile(name) && !instance.manages(name);
            }
        });
        if (files.length == 0)
        {
            logger.info("No commitlog files found; skipping replay");
            return 0;
        }

        Arrays.sort(files, new FileUtils.FileComparator());
        logger.info("Replaying " + StringUtils.join(files, ", "));
        int replayed = recover(files);
        for (File f : files)
        {
            if (!f.delete())
                logger.error("Unable to remove " + f + "; you should remove it manually or next restart will replay it again (harmless, but time-consuming)");
        }
        logger.info("Log replay complete, " + replayed + " replayed mutations");
        return replayed;
    }

    // returns the number of replayed mutations (useful for tests in particular)
    public static int recover(File[] clogs) throws IOException
    {
        final Set<Table> tablesRecovered = new HashSet<Table>();
        List<Future<?>> futures = new ArrayList<Future<?>>();
        byte[] bytes = new byte[4096];
        Map<Integer, AtomicInteger> invalidMutations = new HashMap<Integer, AtomicInteger>();

        // count the number of replayed mutations. We don't really care about atomicity, but we need it to be a reference.
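        // (The count reflects column families actually re-applied during replay, not raw log entries read.)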
        final AtomicInteger replayedCount = new AtomicInteger();

        // compute per-CF and global replay positions
        final Map<Integer, ReplayPosition> cfPositions = new HashMap<Integer, ReplayPosition>();
        for (ColumnFamilyStore cfs : ColumnFamilyStore.all())
        {
            // it's important to call RP.gRP per-cf, before aggregating all the positions w/ the Ordering.min call
            // below: gRP will return NONE if there are no flushed sstables, which is important to have in the
            // list (otherwise we'll just start replay from the first flush position that we do have, which is not correct).
            ReplayPosition rp = ReplayPosition.getReplayPosition(cfs.getSSTables());
            cfPositions.put(cfs.metadata.cfId, rp);
        }
        final ReplayPosition globalPosition = Ordering.from(ReplayPosition.comparator).min(cfPositions.values());

        Checksum checksum = new CRC32();
        for (final File file : clogs)
        {
            final long segment = CommitLogSegment.idFromFilename(file.getName());

            int bufferSize = (int) Math.min(Math.max(file.length(), 1), 32 * 1024 * 1024);
            BufferedRandomAccessFile reader = new BufferedRandomAccessFile(new File(file.getAbsolutePath()), "r", bufferSize, true);
            assert reader.length() <= Integer.MAX_VALUE;

            try
            {
                int replayPosition;
                if (globalPosition.segment < segment)
                    replayPosition = 0;
                else if (globalPosition.segment == segment)
                    replayPosition = globalPosition.position;
                else
                    replayPosition = (int) reader.length();

                if (replayPosition < 0 || replayPosition >= reader.length())
                {
                    // replayPosition > reader.length() can happen if some data gets flushed before it is written to the commitlog
                    // (see https://issues.apache.org/jira/browse/CASSANDRA-2285)
                    logger.debug("skipping replay of fully-flushed {}", file);
                    continue;
                }

                reader.seek(replayPosition);

                if (logger.isDebugEnabled())
                    logger.debug("Replaying " + file + " starting at " + reader.getFilePointer());

                // read the log entries, populate RowMutations and apply them.
                // Each entry on disk is: serialized size (int), checksum of that size (long),
                // the serialized RowMutation, then a CRC32 of the mutation bytes (long).
                while (!reader.isEOF())
                {
                    if (logger.isDebugEnabled())
                        logger.debug("Reading mutation at " + reader.getFilePointer());

                    long claimedCRC32;
                    int serializedSize;
                    try
                    {
                        // any of the reads may hit EOF
                        serializedSize = reader.readInt();
                        // RowMutation must be at LEAST 10 bytes:
                        // 3 each for a non-empty Table and Key (including the 2-byte length from
                        // writeUTF/writeWithShortLength) and 4 bytes for column count.
                        // This prevents the CRC from being fooled by special-case garbage in the file; see CASSANDRA-2128
                        if (serializedSize < 10)
                            break;
                        long claimedSizeChecksum = reader.readLong();
                        checksum.reset();
                        checksum.update(serializedSize);
                        if (checksum.getValue() != claimedSizeChecksum)
                            break; // entry wasn't synced correctly/fully. that's ok.

                        if (serializedSize > bytes.length)
                            bytes = new byte[(int) (1.2 * serializedSize)];
                        reader.readFully(bytes, 0, serializedSize);
                        claimedCRC32 = reader.readLong();
                    }
                    catch (EOFException eof)
                    {
                        break; // last CL entry didn't get completely written. that's ok.
                    }

                    checksum.update(bytes, 0, serializedSize);
                    if (claimedCRC32 != checksum.getValue())
                    {
                        // this entry must not have been fsynced. probably the rest is bad too,
                        // but just in case there is no harm in trying them (since we still read on an entry boundary)
                        continue;
                    }

                    /* deserialize the commit log entry */
                    ByteArrayInputStream bufIn = new ByteArrayInputStream(bytes, 0, serializedSize);
                    RowMutation rm = null;
                    try
                    {
                        // assuming version here. We've gone to lengths to make sure what gets written to the CL is in
                        // the current version. so do make sure the CL is drained prior to upgrading a node.
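                        // Mutations that reference a column family that no longer exists fail to
                        // deserialize with UnserializableColumnFamilyException; they are counted in
                        // invalidMutations and skipped below.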
                        rm = RowMutation.serializer().deserialize(new DataInputStream(bufIn), MessagingService.version_, false);
                    }
                    catch (UnserializableColumnFamilyException ex)
                    {
                        AtomicInteger i = invalidMutations.get(ex.cfId);
                        if (i == null)
                        {
                            i = new AtomicInteger(1);
                            invalidMutations.put(ex.cfId, i);
                        }
                        else
                            i.incrementAndGet();
                        continue;
                    }

                    if (logger.isDebugEnabled())
                        logger.debug(String.format("replaying mutation for %s.%s: %s",
                                                   rm.getTable(),
                                                   ByteBufferUtil.bytesToHex(rm.key()),
                                                   "{" + StringUtils.join(rm.getColumnFamilies(), ", ") + "}"));

                    final long entryLocation = reader.getFilePointer();
                    final RowMutation frm = rm;
                    Runnable runnable = new WrappedRunnable()
                    {
                        public void runMayThrow() throws IOException
                        {
                            if (DatabaseDescriptor.getKSMetaData(frm.getTable()) == null)
                                return;

                            final Table table = Table.open(frm.getTable());
                            RowMutation newRm = new RowMutation(frm.getTable(), frm.key());

                            // Rebuild the row mutation, omitting column families that a) have already been flushed,
                            // or b) are part of a cf that was dropped. Keep in mind that cf.name() is suspect; do
                            // everything based on the cfid instead.
                            for (ColumnFamily columnFamily : frm.getColumnFamilies())
                            {
                                if (CFMetaData.getCF(columnFamily.id()) == null)
                                    // null means the cf has been dropped
                                    continue;

                                ReplayPosition rp = cfPositions.get(columnFamily.id());

                                // replay if current segment is newer than last flushed one or, if it is the last known
                                // segment, if we are after the replay position
                                if (segment > rp.segment || (segment == rp.segment && entryLocation > rp.position))
                                {
                                    newRm.add(columnFamily);
                                    replayedCount.incrementAndGet();
                                }
                            }
                            if (!newRm.isEmpty())
                            {
                                Table.open(newRm.getTable()).apply(newRm, false);
                                tablesRecovered.add(table);
                            }
                        }
                    };
                    futures.add(StageManager.getStage(Stage.MUTATION).submit(runnable));
                    if (futures.size() > MAX_OUTSTANDING_REPLAY_COUNT)
                    {
                        FBUtilities.waitOnFutures(futures);
                        futures.clear();
                    }
                }
            }
            finally
            {
                FileUtils.closeQuietly(reader);
                logger.info("Finished reading " + file);
            }
        }

        for (Map.Entry<Integer, AtomicInteger> entry : invalidMutations.entrySet())
            logger.info(String.format("Skipped %d mutations from unknown (probably removed) CF with id %d", entry.getValue().intValue(), entry.getKey()));

        // wait for all the writes to finish on the mutation stage
        FBUtilities.waitOnFutures(futures);
        logger.debug("Finished waiting on mutations from recovery");

        // flush replayed tables
        futures.clear();
        for (Table table : tablesRecovered)
            futures.addAll(table.flush());
        FBUtilities.waitOnFutures(futures);
        return replayedCount.get();
    }

    private CommitLogSegment currentSegment()
    {
        return segments.getLast();
    }

    public ReplayPosition getContext()
    {
        Callable<ReplayPosition> task = new Callable<ReplayPosition>()
        {
            public ReplayPosition call() throws Exception
            {
                return currentSegment().getContext();
            }
        };
        try
        {
            return executor.submit(task).get();
        }
        catch (InterruptedException e)
        {
            throw new RuntimeException(e);
        }
        catch (ExecutionException e)
        {
            throw new RuntimeException(e);
        }
    }

    // for tests mainly
    public int segmentsCount()
    {
        return segments.size();
    }

    /*
     * Adds the specified row to the commit log. This method will reset the
     * file offset to what it was before the start of the operation in case
     * of any problems. This way we can assume that the subsequent commit log
     * entry will override the garbage left over by the previous write.
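     * The write itself is performed by a LogRecordAdder task submitted to the
     * commit log executor; whether this call waits for the entry to reach disk
     * depends on the configured CommitLogSync strategy (batch vs. periodic).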
     */
    public void add(RowMutation rowMutation) throws IOException
    {
        executor.add(new LogRecordAdder(rowMutation));
    }

    /*
     * This is called on Memtable flush to add to the commit log a token
     * indicating that this column family has been flushed. The bit flag
     * associated with this column family is set in the header, and this is
     * used to decide if the log file can be deleted.
     */
    public void discardCompletedSegments(final Integer cfId, final ReplayPosition context) throws IOException
    {
        Callable task = new Callable()
        {
            public Object call() throws IOException
            {
                discardCompletedSegmentsInternal(context, cfId);
                return null;
            }
        };
        try
        {
            executor.submit(task).get();
        }
        catch (InterruptedException e)
        {
            throw new RuntimeException(e);
        }
        catch (ExecutionException e)
        {
            throw new RuntimeException(e);
        }
    }

    /**
     * Delete log segments whose contents have been turned into SSTables. NOT threadsafe.
     *
     * @param context the commit log context (replay position) captured at flush time
     * @param id      id of the columnFamily being flushed to disk
     */
    private void discardCompletedSegmentsInternal(ReplayPosition context, Integer id) throws IOException
    {
        if (logger.isDebugEnabled())
            logger.debug("discard completed log segments for " + context + ", column family " + id + ".");

        /*
         * Loop through all the commit log files in the history. Now process
         * all files that are older than the one in the context. For each of
         * these files the header needs to be modified by resetting the dirty
         * bit corresponding to the flushed CF.
         */
        Iterator<CommitLogSegment> iter = segments.iterator();
        while (iter.hasNext())
        {
            CommitLogSegment segment = iter.next();
            if (segment.id == context.segment)
            {
                // Only unmark this segment if there has been no write since the
                // ReplayPosition was grabbed.
                segment.turnOffIfNotWritten(id, context.position);
                maybeDiscardSegment(segment, iter);
                break;
            }

            segment.turnOff(id);
            maybeDiscardSegment(segment, iter);
        }
    }

    private void maybeDiscardSegment(CommitLogSegment segment, Iterator<CommitLogSegment> iter)
    {
        if (segment.isSafeToDelete() && iter.hasNext())
        {
            logger.info("Discarding obsolete commit log: " + segment);
            segment.close();
            DeletionService.executeDelete(segment.getPath());
            // usually this will be the first (remaining) segment, but not always, if segment A contains
            // writes to a CF that is unflushed but is followed by segment B whose CFs are all flushed.
            iter.remove();
        }
        else
        {
            if (logger.isDebugEnabled())
                logger.debug("Not safe to delete commit log " + segment + "; dirty is " + segment.dirtyString() + "; hasNext: " + iter.hasNext());
        }
    }

    void sync() throws IOException
    {
        currentSegment().sync();
    }

    public void forceNewSegment()
    {
        Callable<?> task = new Callable()
        {
            public Object call() throws IOException
            {
                createNewSegment();
                return null;
            }
        };
        try
        {
            executor.submit(task).get();
        }
        catch (InterruptedException e)
        {
            throw new AssertionError(e);
        }
        catch (ExecutionException e)
        {
            throw new RuntimeException(e);
        }
    }

    private void createNewSegment() throws IOException
    {
        sync();
        segments.add(new CommitLogSegment());
    }

    // TODO this should be a Runnable since it doesn't actually return anything, but it's difficult to do that
    // without breaking the fragile CheaterFutureTask in BatchCLES.
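    // Appends a single RowMutation to the active segment, rolling to a fresh segment
    // once the active one reaches segmentSize; executed via the commit log executor.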
    class LogRecordAdder implements Callable, Runnable
    {
        final RowMutation rowMutation;

        LogRecordAdder(RowMutation rm)
        {
            this.rowMutation = rm;
        }

        public void run()
        {
            try
            {
                currentSegment().write(rowMutation);
                // roll log if necessary
                if (currentSegment().length() >= segmentSize)
                    createNewSegment();
            }
            catch (IOException e)
            {
                throw new IOError(e);
            }
        }

        public Object call() throws Exception
        {
            run();
            return null;
        }
    }

    public void shutdownBlocking() throws InterruptedException
    {
        executor.shutdown();
        executor.awaitTermination();
    }
}