/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.cassandra.db; import java.io.*; import java.lang.management.ManagementFactory; import java.nio.charset.Charset; import java.util.*; import java.util.concurrent.*; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicReference; import java.util.concurrent.locks.Condition; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.management.MBeanServer; import javax.management.ObjectName; import com.google.common.base.Predicate; import com.google.common.collect.Iterables; import com.google.common.collect.Iterators; import org.apache.log4j.Logger; import org.apache.commons.collections.IteratorUtils; import org.apache.cassandra.concurrent.JMXEnabledThreadPoolExecutor; import org.apache.cassandra.concurrent.NamedThreadFactory; import org.apache.cassandra.concurrent.RetryingScheduledThreadPoolExecutor; import org.apache.cassandra.config.CFMetaData; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.db.commitlog.CommitLog; import org.apache.cassandra.db.commitlog.CommitLogSegment; import org.apache.cassandra.db.commitlog.CommitLogSegment.CommitLogContext; import org.apache.cassandra.db.filter.*; import org.apache.cassandra.db.marshal.AbstractType; import org.apache.cassandra.dht.AbstractBounds; import org.apache.cassandra.dht.Bounds; import org.apache.cassandra.dht.Range; import org.apache.cassandra.io.*; import org.apache.cassandra.io.util.FileUtils; import org.apache.cassandra.service.StorageProxy; import org.apache.cassandra.service.StorageService; import org.apache.cassandra.thrift.SliceRange; import org.apache.cassandra.utils.*; public class ColumnFamilyStore implements ColumnFamilyStoreMBean { private static final ScheduledThreadPoolExecutor cacheSavingExecutor = new RetryingScheduledThreadPoolExecutor("CACHE-SAVER", Thread.MIN_PRIORITY); private static Logger logger_ = Logger.getLogger(ColumnFamilyStore.class); /* * submitFlush first puts [Binary]Memtable.getSortedContents on the flushSorter executor, * which then puts the sorted results on the writer executor. This is because sorting is CPU-bound, * and writing is disk-bound; we want to be able to do both at once. When the write is complete, * we turn the writer into an SSTableReader and add it to ssTables_ where it is available for reads. * * For BinaryMemtable that's about all that happens. For live Memtables there are two other things * that switchMemtable does (which should be the only caller of submitFlush in this case). * First, it puts the Memtable into memtablesPendingFlush, where it stays until the flush is complete * and it's been added as an SSTableReader to ssTables_. 
Second, it adds an entry to postFlushExecutor * that waits for the flush to complete, then calls onMemtableFlush. This allows multiple flushes * to happen simultaneously on multicore systems, while still calling onMF in the correct order, * which is necessary for replay in case of a restart since CommitLog assumes that when onMF is * called, all data up to the given context has been persisted to SSTables. */ private static final ExecutorService flushSorter = new JMXEnabledThreadPoolExecutor(1, Runtime.getRuntime().availableProcessors(), Integer.MAX_VALUE, TimeUnit.SECONDS, new ArrayBlockingQueue<Runnable>(Runtime.getRuntime().availableProcessors()), new NamedThreadFactory("FLUSH-SORTER-POOL")); private static final ExecutorService flushWriter = new JMXEnabledThreadPoolExecutor(1, DatabaseDescriptor.getAllDataFileLocations().length, Integer.MAX_VALUE, TimeUnit.SECONDS, new ArrayBlockingQueue<Runnable>(DatabaseDescriptor.getAllDataFileLocations().length + DatabaseDescriptor.getFlushQueueSize()), new NamedThreadFactory("FLUSH-WRITER-POOL",DatabaseDescriptor.getCompactionPriority())); public static final ExecutorService postFlushExecutor = new JMXEnabledThreadPoolExecutor("MEMTABLE-POST-FLUSHER"); private static final int KEY_RANGE_FILE_BUFFER_SIZE = 256 * 1024; private Set<Memtable> memtablesPendingFlush = new ConcurrentSkipListSet<Memtable>(); private final String table_; public final String columnFamily_; private final boolean isSuper_; public final CFMetaData metadata; private volatile Integer memtableSwitchCount = 0; /* This is used to generate the next index for a SSTable */ private AtomicInteger fileIndexGenerator_ = new AtomicInteger(0); /* active memtable associated with this ColumnFamilyStore. */ private Memtable memtable_; // TODO binarymemtable ops are not threadsafe (do they need to be?) private AtomicReference<BinaryMemtable> binaryMemtable_; /* SSTables on disk for this column family */ private SSTableTracker ssTables_; private LatencyTracker readStats_ = new LatencyTracker(); private LatencyTracker writeStats_ = new LatencyTracker(); private long minRowCompactedSize = 0L; private long maxRowCompactedSize = 0L; private long rowsCompactedTotalSize = 0L; private long rowsCompactedCount = 0L; private Runnable rowCacheWriteTask; private Runnable keyCacheWriteTask; ColumnFamilyStore(String table, String columnFamilyName, boolean isSuper, int indexValue) throws IOException { table_ = table; columnFamily_ = columnFamilyName; isSuper_ = isSuper; metadata = DatabaseDescriptor.getCFMetaData(table, columnFamilyName); fileIndexGenerator_.set(indexValue); memtable_ = new Memtable(this); binaryMemtable_ = new AtomicReference<BinaryMemtable>(new BinaryMemtable(this)); if (logger_.isDebugEnabled()) logger_.debug("Starting CFS " + columnFamily_); // scan for data files corresponding to this CF List<File> sstableFiles = new ArrayList<File>(); Pattern auxFilePattern = Pattern.compile("(.*)(-Filter\\.db$|-Index\\.db$)"); Pattern tmpCacheFilePattern = Pattern.compile(table + "-" + columnFamilyName + "-(Key|Row)Cache.*\\.tmp$"); for (File file : files()) { String filename = file.getName(); /* look for and remove orphans. An orphan is a -Filter.db or -Index.db with no corresponding -Data.db. 
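For example (illustrative filename only; actual names depend on the column family and generation): Standard1-3-Filter.db is treated as an orphan here if Standard1-3-Data.db does not exist, and is deleted below.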
*/ Matcher matcher = auxFilePattern.matcher(file.getAbsolutePath()); if (matcher.matches()) { String basePath = matcher.group(1); if (!new File(basePath + "-Data.db").exists()) { logger_.info(String.format("Removing orphan %s", file.getAbsolutePath())); FileUtils.deleteWithConfirm(file); continue; } } if (((file.length() == 0 && !filename.endsWith("-Compacted")) || (filename.contains("-" + SSTable.TEMPFILE_MARKER)))) { FileUtils.deleteWithConfirm(file); continue; } if (tmpCacheFilePattern.matcher(filename).matches()) { logger_.info("removing incomplete saved cache " + file.getAbsolutePath()); FileUtils.deleteWithConfirm(file); continue; } if (filename.contains("-Data.db")) { sstableFiles.add(file.getAbsoluteFile()); } } Collections.sort(sstableFiles, new FileUtils.FileComparator()); // scan for sstables corresponding to this cf and load them ssTables_ = new SSTableTracker(table, columnFamilyName); Set<String> savedKeys = readSavedCache(DatabaseDescriptor.getSerializedKeyCachePath(table, columnFamilyName), false); List<SSTableReader> sstables = new ArrayList<SSTableReader>(); for (File file : sstableFiles) { String filename = file.getAbsolutePath(); if (SSTable.deleteIfCompacted(filename)) continue; SSTableReader sstable; try { sstable = SSTableReader.open(filename, savedKeys, ssTables_); } catch (IOException ex) { logger_.error("Corrupt file " + filename + "; skipped", ex); throw ex; } sstables.add(sstable); } ssTables_.add(sstables); } protected Set<String> readSavedCache(File path, boolean sort) { Set<String> keys; if (sort) { // sort the results on read because cache may be written many times during server lifetime, // so better to pay that price once on startup than sort at write time. keys = new TreeSet<String>(StorageProxy.keyComparator); } else { keys = new HashSet<String>(); } if (path.exists()) { try { long start = System.currentTimeMillis(); logger_.info("reading saved cache " + path); ObjectInputStream in = new ObjectInputStream(new BufferedInputStream(new FileInputStream(path))); Charset UTF8 = Charset.forName("UTF-8"); while (in.available() > 0) { int size = in.readInt(); byte[] bytes = new byte[size]; in.readFully(bytes); keys.add(new String(bytes, UTF8)); } in.close(); if (logger_.isDebugEnabled()) logger_.debug(String.format("completed reading (%d ms; %d keys) saved cache %s", (System.currentTimeMillis() - start), keys.size(), path)); } catch (IOException ioe) { logger_.warn("error reading saved cache " + path, ioe); } } return keys; } // must be called after all sstables are loaded since row cache merges all row versions public void initRowCache() { String msgSuffix = String.format(" row cache for %s of %s", columnFamily_, table_); int rowCacheSavePeriodInSeconds = DatabaseDescriptor.getTableMetaData(table_).get(columnFamily_).rowCacheSavePeriodInSeconds; int keyCacheSavePeriodInSeconds = DatabaseDescriptor.getTableMetaData(table_).get(columnFamily_).keyCacheSavePeriodInSeconds; long start = System.currentTimeMillis(); for (String key : readSavedCache(DatabaseDescriptor.getSerializedRowCachePath(table_, columnFamily_), true)) { cacheRow(key); } if (ssTables_.getRowCache().getSize() > 0) logger_.info(String.format("completed loading (%d ms; %d keys) %s", System.currentTimeMillis() - start, ssTables_.getRowCache().getSize(), msgSuffix)); rowCacheWriteTask = new WrappedRunnable() { protected void runMayThrow() throws IOException { ssTables_.saveRowCache(); } }; if (rowCacheSavePeriodInSeconds > 0) { cacheSavingExecutor.scheduleWithFixedDelay(rowCacheWriteTask, 
rowCacheSavePeriodInSeconds, rowCacheSavePeriodInSeconds, TimeUnit.SECONDS); } keyCacheWriteTask = new WrappedRunnable() { protected void runMayThrow() throws IOException { ssTables_.saveKeyCache(); } }; if (keyCacheSavePeriodInSeconds > 0) { cacheSavingExecutor.scheduleWithFixedDelay(keyCacheWriteTask, keyCacheSavePeriodInSeconds, keyCacheSavePeriodInSeconds, TimeUnit.SECONDS); } } public Future<?> submitKeyCacheWrite() { return cacheSavingExecutor.submit(keyCacheWriteTask); } public Future<?> submitRowCacheWrite() { return cacheSavingExecutor.submit(rowCacheWriteTask); } public void addToCompactedRowStats(Long rowsize) { if (minRowCompactedSize < 1 || rowsize < minRowCompactedSize) minRowCompactedSize = rowsize; if (rowsize > maxRowCompactedSize) maxRowCompactedSize = rowsize; rowsCompactedCount++; rowsCompactedTotalSize += rowsize; } public long getMinRowCompactedSize() { return minRowCompactedSize; } public long getMaxRowCompactedSize() { return maxRowCompactedSize; } public long getMeanRowCompactedSize() { if (rowsCompactedCount > 0) return rowsCompactedTotalSize / rowsCompactedCount; else return 0L; } public static ColumnFamilyStore createColumnFamilyStore(String table, String columnFamily) throws IOException { /* * Get all data files associated with old Memtables for this table. * These files are named as follows <Table>-1.db, ..., <Table>-n.db. Get * the max which in this case is n and increment it to use it for next * index. */ List<Integer> generations = new ArrayList<Integer>(); String[] dataFileDirectories = DatabaseDescriptor.getAllDataFileLocationsForTable(table); for (String directory : dataFileDirectories) { File fileDir = new File(directory); File[] files = fileDir.listFiles(); for (File file : files) { String filename = file.getName(); String cfName = getColumnFamilyFromFileName(filename); if (cfName.equals(columnFamily)) { generations.add(getGenerationFromFileName(filename)); } } } Collections.sort(generations); int value = (generations.size() > 0) ? (generations.get(generations.size() - 1)) : 0; ColumnFamilyStore cfs = new ColumnFamilyStore(table, columnFamily, "Super".equals(DatabaseDescriptor.getColumnType(table, columnFamily)), value); MBeanServer mbs = ManagementFactory.getPlatformMBeanServer(); try { String mbeanName = "org.apache.cassandra.db:type=ColumnFamilyStores,keyspace=" + table + ",columnfamily=" + columnFamily; mbs.registerMBean(cfs, new ObjectName(mbeanName)); } catch (Exception e) { throw new RuntimeException(e); } return cfs; } private Set<File> files() { Set<File> fileSet = new HashSet<File>(); for (String directory : DatabaseDescriptor.getAllDataFileLocationsForTable(table_)) { File[] files = new File(directory).listFiles(); for (File file : files) { String cfName = getColumnFamilyFromFileName(file.getName()); if (cfName.equals(columnFamily_)) fileSet.add(file); } } return fileSet; } /** * @return the name of the column family */ public String getColumnFamilyName() { return columnFamily_; } private static String getColumnFamilyFromFileName(String filename) { return filename.split("-")[0]; } public static int getGenerationFromFileName(String filename) { /* * File name is of the form <table>-<column family>-<index>-Data.db. * This tokenizer will strip the .db portion. */ StringTokenizer st = new StringTokenizer(filename, "-"); /* * Now I want to get the index portion of the filename. We accumulate * the indices and then sort them to get the max index. 
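For example (an assumed filename): for Standard1-5-Data.db the tokens are [Standard1, 5, Data.db], so the generation is the token at position count - 2, i.e. 5.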
*/ int count = st.countTokens(); int i = 0; String index = null; while (st.hasMoreElements()) { index = (String) st.nextElement(); if (i == (count - 2)) { break; } ++i; } return Integer.parseInt(index); } /* * @return a temporary file name for an sstable. * When the sstable object is closed, it will be renamed to a non-temporary * format, so incomplete sstables can be recognized and removed on startup. */ public String getFlushPath() throws IOException { String location = estimateFlushPath(); String ssTableFileName = getTempSSTableFileName(); if (location!=null) return new File(location, ssTableFileName).getAbsolutePath(); logger_.warn("Insufficient disk space to flush "+ssTableFileName+". Trying to force GC"); try { // Hoping GC will remove some not used tables. System.gc(); Thread.sleep(60000); location = estimateFlushPath(); if (location!=null) return new File(location, ssTableFileName).getAbsolutePath(); // Hoping GC will remove some not used tables - sometimes 2 GC cycles are needed. logger_.warn("Still Insufficient disk space to flush "+ssTableFileName+". Trying to force GC 2nd time"); System.gc(); Thread.sleep(60000); location = estimateFlushPath(); } catch (InterruptedException e) { } if (location == null) throw new IOException("Insufficient disk space to flush "+ssTableFileName); return new File(location, ssTableFileName).getAbsolutePath(); } /** * @return path with enough space on disk to write ss table or null, if no disk space left */ private String estimateFlushPath() { long guessedSize = 2 * DatabaseDescriptor.getMemtableThroughput() * 1024*1024; // 2* adds room for keys, column indexes String location = DatabaseDescriptor.getDataFileLocation(this, guessedSize); return location; } public String getDataFileLocation(long expectedCompactedFileSize) { String path = DatabaseDescriptor.getDataFileLocation(this, expectedCompactedFileSize); if (path == null) { // retry after GCing to force unmap of compacted SSTables so they can be deleted StorageService.instance.requestGC(); try { Thread.sleep(SSTableDeletingReference.RETRY_DELAY * 2); } catch (InterruptedException e) { throw new AssertionError(e); } path = DatabaseDescriptor.getDataFileLocation(this, expectedCompactedFileSize); } return path; } public String getTempSSTableFileName() { return String.format("%s-%s-%s-Data.db", columnFamily_, SSTable.TEMPFILE_MARKER, fileIndexGenerator_.incrementAndGet()); } /** flush the given memtable and swap in a new one for its CFS, if it hasn't been frozen already. threadsafe. */ Future<?> maybeSwitchMemtable(Memtable oldMemtable, final boolean writeCommitLog) { if (oldMemtable.isFrozen()) { logger_.debug("memtable is already frozen; another thread must be flushing it"); return null; } /** * If we can get the writelock, that means no new updates can come in and * all ongoing updates to memtables have completed. We can get the tail * of the log and use it as the starting position for log replay on recovery. */ Table.flusherLock.writeLock().lock(); try { if (oldMemtable.isFrozen()) { return null; } oldMemtable.freeze(); // dont wait with write lock held for commit log queue - only obtain future and continue // the future due to the single threaded nature of commit log will position itself in queue // after all mutations which possibly have been written to flushing memtable and // on get() will return commit log position of memtable flush start final Future<CommitLogContext> ctx = writeCommitLog ? 
CommitLog.instance().getContext() : null; // logger_.info(columnFamily_ + " has reached its threshold; switching in a fresh Memtable at " + ctx); final Condition condition = submitFlush(oldMemtable); memtable_ = new Memtable(this); // a second executor that makes sure the onMemtableFlushes get called in the right order, // while keeping the wait-for-flush (future.get) out of anything latency-sensitive. return postFlushExecutor.submit(new WrappedRunnable() { public void runMayThrow() throws InterruptedException, IOException { condition.await(); if (writeCommitLog) { // if we're not writing to the commit log, we are replaying the log, so marking // the log header with "you can discard anything written before the context" is not valid try { CommitLogContext ctxValue = ctx.get(); logger_.info(columnFamily_ + " has reached its threshold; switched in a fresh Memtable at " + ctxValue); CommitLog.instance().discardCompletedSegments(table_, columnFamily_, ctxValue); } catch (ExecutionException e) { throw new RuntimeException(e); } } } }); } finally { Table.flusherLock.writeLock().unlock(); if (memtableSwitchCount == Integer.MAX_VALUE) { memtableSwitchCount = 0; } memtableSwitchCount++; } } void switchBinaryMemtable(String key, byte[] buffer) { binaryMemtable_.set(new BinaryMemtable(this)); binaryMemtable_.get().put(key, buffer); } public void setListener(IStoreApplyListener applyFilter) { getTable().setStoreListener(this, applyFilter); } public void forceFlushIfExpired() throws ExecutionException, InterruptedException { if (memtable_.isExpired()) forceBlockingFlush(); } public Future<?> forceFlush() { if (memtable_.isClean()) return null; return maybeSwitchMemtable(memtable_, true); } public void forceBlockingFlush() throws ExecutionException, InterruptedException { Future<?> future = forceFlush(); if (future != null) future.get(); } public void forceFlushBinary() { if (binaryMemtable_.get().isClean()) return; submitFlush(binaryMemtable_.get()); } /** * Insert/Update the column family for this key. * Caller is responsible for acquiring Table.flusherLock! * param @ lock - lock that needs to be used. * param @ key - key for update/insert * param @ columnFamily - columnFamily changes */ Memtable apply(String key, ColumnFamily columnFamily) throws IOException { long start = System.nanoTime(); boolean flushRequested = memtable_.isThresholdViolated(); memtable_.put(key, columnFamily); writeStats_.addNano(System.nanoTime() - start); return flushRequested ? memtable_ : null; } /* * Insert/Update the column family for this key. param @ lock - lock that * needs to be used. param @ key - key for update/insert param @ * columnFamily - columnFamily changes */ void applyBinary(String key, byte[] buffer) { long start = System.nanoTime(); binaryMemtable_.get().put(key, buffer); writeStats_.addNano(System.nanoTime() - start); } /* This is complicated because we need to preserve deleted columns, supercolumns, and columnfamilies until they have been deleted for at least GC_GRACE_IN_SECONDS. But, we do not need to preserve their contents; just the object itself as a "tombstone" that can be used to repair other replicas that do not know about the deletion. */ public static ColumnFamily removeDeleted(ColumnFamily cf, int gcBefore) { if (cf == null) { return null; } if (cf.isSuper()) removeDeletedSuper(cf, gcBefore); else removeDeletedStandard(cf, gcBefore); // in case of a timestamp tie, tombstones get priority over non-tombstones. // (we want this to be deterministic to avoid confusion.) 
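// Illustrative example (timings assumed, not taken from this code): with gcBefore = now - GC_GRACE_IN_SECONDS,
// a row whose columns were all purged above and whose row-level tombstone is older than gcBefore falls through
// to the null return below and is dropped entirely; a younger tombstone is kept so it can still repair other replicas.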
if (cf.getColumnCount() == 0 && cf.getLocalDeletionTime() <= gcBefore) { return null; } return cf; } private static void removeDeletedStandard(ColumnFamily cf, int gcBefore) { for (Iterator<IColumn> iterator = cf.getSortedColumns().iterator(); iterator.hasNext();) { IColumn c = iterator.next(); if ((c.isMarkedForDelete() && c.getLocalDeletionTime() <= gcBefore) || c.timestamp() <= cf.getMarkedForDeleteAt()) { iterator.remove(); } } } private static void removeDeletedSuper(ColumnFamily cf, int gcBefore) { // TODO assume deletion means "most are deleted?" and add to clone, instead of remove from original? // this could be improved by having compaction, or possibly even removeDeleted, r/m the tombstone // once gcBefore has passed, so if new data is added it doesn't use the wrong algorithm forever for (Iterator<IColumn> iterator = cf.getSortedColumns().iterator(); iterator.hasNext();) { IColumn c = iterator.next(); long minTimestamp = Math.max(c.getMarkedForDeleteAt(), cf.getMarkedForDeleteAt()); for (IColumn subColumn : c.getSubColumns()) { if (subColumn.timestamp() <= minTimestamp || (subColumn.isMarkedForDelete() && subColumn.getLocalDeletionTime() <= gcBefore)) { ((SuperColumn)c).remove(subColumn.name()); } } if (c.getSubColumns().isEmpty() && c.getLocalDeletionTime() <= gcBefore) { iterator.remove(); } } } /** * Uses bloom filters to check if a key may be present in any sstable in this * ColumnFamilyStore, minus a set of provided ones. * * Because BFs are checked, negative returns ensure that the key is not * present in the checked SSTables, but positive ones do not guarantee key * presence. */ public boolean isKeyInRemainingSSTables(DecoratedKey key, Set<SSTable> sstablesToIgnore) { for (SSTableReader sstable : ssTables_) { if (!sstablesToIgnore.contains(sstable) && sstable.getBloomFilter().isPresent(key.key)) return true; } return false; } /* * Called after the Memtable flushes its in-memory data, or we add a file * via bootstrap. This information is * cached in the ColumnFamilyStore. This is useful for reads because the * ColumnFamilyStore first looks in the in-memory store and then on * disk to find the key. If invoked during recoveryMode, * onMemtableFlush() need not be invoked. * * param @ sstable - the SSTable just flushed to disk */ public void addSSTable(SSTableReader sstable) { ssTables_.add(Arrays.asList(sstable)); CompactionManager.instance.submitMinorIfNeeded(this); } /* * Add up all the file sizes; this is the worst-case file * size for compaction of the given list of files. */ long getExpectedCompactedFileSize(Iterable<SSTableReader> sstables) { long expectedFileSize = 0; for (SSTableReader sstable : sstables) { long size = sstable.length(); expectedFileSize = expectedFileSize + size; } return expectedFileSize; } /* * Find the maximum-size file in the list.
*/ SSTableReader getMaxSizeFile(Iterable<SSTableReader> sstables) { long maxSize = 0L; SSTableReader maxFile = null; for (SSTableReader sstable : sstables) { if (sstable.length() > maxSize) { maxSize = sstable.length(); maxFile = sstable; } } return maxFile; } public void forceCleanup() { CompactionManager.instance.submitCleanup(ColumnFamilyStore.this); } public Table getTable() { try { return Table.open(table_); } catch (IOException e) { throw new FSReadError(e); } } public void markCompacted(Collection<SSTableReader> sstables) throws IOException { ssTables_.markCompacted(sstables); } boolean isCompleteSSTables(Collection<SSTableReader> sstables) { return ssTables_.getSSTables().equals(new HashSet<SSTableReader>(sstables)); } void replaceCompactedSSTables(Collection<SSTableReader> sstables, Iterable<SSTableReader> replacements) { ssTables_.replace(sstables, replacements); } /** * Submits the flush sort on the flushSorter executor, which will in turn submit to flushWriter when sorted. * TODO because our executors use CallerRunsPolicy, when flushSorter fills up, no writes will proceed, * because the next flush will start executing on the calling mutation-stage thread, which holds the * flush write lock. (writes acquire this as a read lock before proceeding.) * This is good, because it backpressures flushes, but bad, because we can't write until that last * flushing thread finishes sorting, which will almost always take longer than any of the flushSorter threads proper * (since, by definition, it started last). */ Condition submitFlush(IFlushable flushable) { logger_.info("Enqueuing flush of " + flushable); final Condition condition = new SimpleCondition(); flushable.flushAndSignal(condition, flushSorter, flushWriter); return condition; } public boolean isSuper() { return isSuper_; } public int getMemtableColumnsCount() { return getMemtableThreadSafe().getCurrentOperations(); } public int getMemtableDataSize() { return getMemtableThreadSafe().getCurrentThroughput(); } public int getMemtableSwitchCount() { return memtableSwitchCount; } /** * get the current memtable in a threadsafe fashion. note that simply "return memtable_" is * incorrect; you need to lock to introduce a thread-safe happens-before ordering. * * do NOT use this method to do either a put or get on the memtable object, since it could be * flushed in the meantime (and its executor terminated). * * also do NOT make this method public or it will become really hard to reason about these things.
* @return the active memtable */ private Memtable getMemtableThreadSafe() { Table.flusherLock.readLock().lock(); try { return memtable_; } finally { Table.flusherLock.readLock().unlock(); } } public Iterator<DecoratedKey> memtableKeyIterator(DecoratedKey startWith) throws ExecutionException, InterruptedException { Table.flusherLock.readLock().lock(); try { return memtable_.getKeyIterator(startWith); } finally { Table.flusherLock.readLock().unlock(); } } public Iterator<Map.Entry<DecoratedKey, ColumnFamily>> memtableEntryIterator() { Table.flusherLock.readLock().lock(); try { return memtable_.getEntryIterator(); } finally { Table.flusherLock.readLock().unlock(); } } public Collection<SSTableReader> getSSTables() { return ssTables_.getSSTables(); } public long getReadCount() { return readStats_.getOpCount(); } public double getRecentReadLatencyMicros() { return readStats_.getRecentLatencyMicros(); } public long[] getLifetimeReadLatencyHistogramMicros() { return readStats_.getTotalLatencyHistogramMicros(); } public long[] getRecentReadLatencyHistogramMicros() { return readStats_.getRecentLatencyHistogramMicros(); } public long getTotalReadLatencyMicros() { return readStats_.getTotalLatencyMicros(); } // TODO this actually isn't a good measure of pending tasks public int getPendingTasks() { return Table.flusherLock.getQueueLength(); } public long getWriteCount() { return writeStats_.getOpCount(); } public long getTotalWriteLatencyMicros() { return writeStats_.getTotalLatencyMicros(); } public double getRecentWriteLatencyMicros() { return writeStats_.getRecentLatencyMicros(); } public long[] getLifetimeWriteLatencyHistogramMicros() { return writeStats_.getTotalLatencyHistogramMicros(); } public long[] getRecentWriteLatencyHistogramMicros() { return writeStats_.getRecentLatencyHistogramMicros(); } public ColumnFamily getColumnFamily(String key, QueryPath path, byte[] start, byte[] finish, boolean reversed, int limit) { return getColumnFamily(new SliceQueryFilter(key, path, start, finish, reversed, limit)); } public ColumnFamily getColumnFamily(QueryFilter filter) { return getColumnFamily(filter, CompactionManager.getDefaultGcBefore(this)); } private ColumnFamily cacheRow(String key) { ColumnFamily cached; if ((cached = ssTables_.getRowCache().get(key)) == null) { cached = getTopLevelColumns(new IdentityQueryFilter(key, new QueryPath(columnFamily_)), Integer.MIN_VALUE); if (cached == null) return null; ssTables_.getRowCache().put(key, cached); } return cached; } /** * get a list of columns starting from a given column, in a specified order. * only the latest version of a column is returned. * @return null if there is no data and no tombstones; otherwise a ColumnFamily */ public ColumnFamily getColumnFamily(QueryFilter filter, int gcBefore) { assert columnFamily_.equals(filter.getColumnFamilyName()); long start = System.nanoTime(); try { if (filter.path.superColumnName == null) { if (ssTables_.getRowCache().getCapacity() == 0) return removeDeleted(getTopLevelColumns(filter, gcBefore), gcBefore); ColumnFamily cached = cacheRow(filter.key); ColumnIterator ci = filter.getMemColumnIterator(memtable_, cached, getComparator()); // TODO passing memtable here is confusing since it's almost entirely unused ColumnFamily returnCF = ci.getColumnFamily(); filter.collectCollatedColumns(returnCF, ci, gcBefore); return removeDeleted(returnCF, gcBefore); } // we are querying subcolumns of a supercolumn: fetch the supercolumn with NQF, then filter in-memory.
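// Sketch of the two paths below (descriptive comment, not from the original source): with the row cache disabled
// we fetch only the requested supercolumn via a NamesQueryFilter and getTopLevelColumns; with the row cache enabled
// we pull the whole cached row and clone the supercolumn, so the tombstone adjustment below never mutates the cached copy.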
ColumnFamily cf; SuperColumn sc; if (ssTables_.getRowCache().getCapacity() == 0) { QueryFilter nameFilter = new NamesQueryFilter(filter.key, new QueryPath(columnFamily_), filter.path.superColumnName); cf = getTopLevelColumns(nameFilter, gcBefore); if (cf == null || cf.getColumnCount() == 0) return cf; assert cf.getSortedColumns().size() == 1; sc = (SuperColumn)cf.getSortedColumns().iterator().next(); } else { cf = cacheRow(filter.key); if (cf == null) return null; sc = (SuperColumn)cf.getColumn(filter.path.superColumnName); if (sc == null) return null; sc = (SuperColumn)sc.cloneMe(); } // filterSuperColumn only looks at immediate parent (the supercolumn) when determining if a subcolumn // is still live, i.e., not shadowed by the parent's tombstone. so, bump it up temporarily to the tombstone // time of the cf, if that is greater. long deletedAt = sc.getMarkedForDeleteAt(); if (cf.getMarkedForDeleteAt() > deletedAt) sc.markForDeleteAt(sc.getLocalDeletionTime(), cf.getMarkedForDeleteAt()); SuperColumn scFiltered = filter.filterSuperColumn(sc, gcBefore); ColumnFamily cfFiltered = cf.cloneMeShallow(); scFiltered.markForDeleteAt(sc.getLocalDeletionTime(), deletedAt); // reset sc tombstone time to what it should be cfFiltered.addColumn(scFiltered); return removeDeleted(cfFiltered, gcBefore); } finally { readStats_.addNano(System.nanoTime() - start); } } private ColumnFamily getTopLevelColumns(QueryFilter filter, int gcBefore) { // we are querying top-level columns, do a merging fetch with indexes. List<ColumnIterator> iterators = new ArrayList<ColumnIterator>(); try { final ColumnFamily returnCF; ColumnIterator iter; /* add the current memtable */ iter = filter.getMemColumnIterator(getMemtableThreadSafe(), getComparator()); // TODO this is a little subtle: the Memtable ColumnIterator has to be a shallow clone of the source CF, // with deletion times set correctly, so we can use it as the "base" CF to add query results to. // (for sstable ColumnIterators we do not care if it is a shallow clone or not.) returnCF = iter.getColumnFamily(); iterators.add(iter); /* add the memtables being flushed */ for (Memtable memtable : getMemtablesPendingFlush()) { iter = filter.getMemColumnIterator(memtable, getComparator()); returnCF.delete(iter.getColumnFamily()); iterators.add(iter); } /* add the SSTables on disk */ for (SSTableReader sstable : ssTables_) { iter = filter.getSSTableColumnIterator(sstable); if (iter.getColumnFamily() != null) { returnCF.delete(iter.getColumnFamily()); iterators.add(iter); } } Comparator<IColumn> comparator = filter.getColumnComparator(getComparator()); Iterator collated = IteratorUtils.collatedIterator(comparator, iterators); filter.collectCollatedColumns(returnCF, collated, gcBefore); return removeDeleted(returnCF, gcBefore); } catch (IOException e) { throw new FSReadError(e); } finally { /* close all cursors */ for (ColumnIterator ci : iterators) { try { ci.close(); } catch (Throwable th) { logger_.error("error closing " + ci, th); } } } } /** * @param range: either a Bounds, which includes start key, or a Range, which does not. * @param maxResults * @return list of keys between startWith and stopAt TODO refactor better. this is just getKeyRange w/o the deletion check, for the benefit of range_slice. still opens one randomaccessfile per key, which sucks. something like compactioniterator would be better. 
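* For example (keys a < b < c assumed): a Bounds(a, c) range yields a, b, c while a Range(a, c) yields b, c; the loop
* below skips the first key only when the range is a Range and the key equals startWith, and stops once a key past
* stopAt is seen or maxResults keys have been collected.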
*/ private void getKeyRange(List<String> keys, final AbstractBounds range, int maxResults) throws ExecutionException, InterruptedException { final DecoratedKey startWith = new DecoratedKey(range.left, null); final DecoratedKey stopAt = new DecoratedKey(range.right, null); // create a CollatedIterator that will return unique keys from different sources // (current memtable, historical memtables, and SSTables) in the correct order. final List<Iterator<DecoratedKey>> iterators = new ArrayList<Iterator<DecoratedKey>>(); // we iterate through memtables with a priority queue to avoid more sorting than necessary. // this predicate throws out the keys before the start of our range. Predicate<DecoratedKey> p = new Predicate<DecoratedKey>() { public boolean apply(DecoratedKey key) { return startWith.compareTo(key) <= 0 && (stopAt.isEmpty() || key.compareTo(stopAt) <= 0); } }; // current memtable keys. have to go through the CFS api for locking. iterators.add(Iterators.filter(memtableKeyIterator(startWith), p)); // historical memtables for (Memtable memtable : memtablesPendingFlush) { iterators.add(Iterators.filter(memtable.getKeyIterator(startWith), p)); } try { // sstables for (SSTableReader sstable : ssTables_) { final SSTableScanner scanner = sstable.getScanner(KEY_RANGE_FILE_BUFFER_SIZE); scanner.seekTo(startWith); Iterator<DecoratedKey> iter = new CloseableIterator<DecoratedKey>() { public boolean hasNext() { return scanner.hasNext(); } public DecoratedKey next() { return scanner.next().getKey(); } public void remove() { throw new UnsupportedOperationException(); } public void close() throws IOException { scanner.close(); } }; assert iter instanceof Closeable; // otherwise we leak FDs iterators.add(iter); } } catch (IOException e) { throw new FSReadError(e); } Iterator<DecoratedKey> collated = IteratorUtils.collatedIterator(DecoratedKey.comparator, iterators); Iterable<DecoratedKey> reduced = new ReducingIterator<DecoratedKey, DecoratedKey>(collated) { DecoratedKey current; public void reduce(DecoratedKey current) { this.current = current; } protected DecoratedKey getReduced() { return current; } }; try { // pull keys out of the CollatedIterator boolean first = true; for (DecoratedKey current : reduced) { if (!stopAt.isEmpty() && stopAt.compareTo(current) < 0) { return; } if (range instanceof Bounds || !first || !current.equals(startWith)) { if (logger_.isDebugEnabled()) logger_.debug("scanned " + current); keys.add(current.key); } first = false; if (keys.size() >= maxResults) { return; } } } finally { for (Iterator iter : iterators) { if (iter instanceof Closeable) { try { ((Closeable)iter).close(); } catch (IOException e) { logger_.error("",e); } } } } } /** * * @param super_column * @param range: either a Bounds, which includes start key, or a Range, which does not. * @param keyMax maximum number of keys to process, regardless of startKey/finishKey * @param sliceRange may be null if columnNames is specified. specifies contiguous columns to return in what order. * @param columnNames may be null if sliceRange is specified. specifies which columns to return in what order. @return list of key->list<column> tuples. 
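* (Illustrative numbers: with keyMax = 100 and a sliceRange requesting the first 10 columns, the reply holds at most
* 100 rows, each with at most 10 columns, per the SliceQueryFilter built below.)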
* @throws ExecutionException * @throws InterruptedException */ public RangeSliceReply getRangeSlice(byte[] super_column, final AbstractBounds range, int keyMax, SliceRange sliceRange, List<byte[]> columnNames) throws ExecutionException, InterruptedException { List<String> keys = new ArrayList<String>(); assert range instanceof Bounds || (!((Range)range).isWrapAround() || range.right.equals(StorageService.getPartitioner().getMinimumToken())) : range; getKeyRange(keys, range, keyMax); List<Row> rows = new ArrayList<Row>(keys.size()); final QueryPath queryPath = new QueryPath(columnFamily_, super_column, null); final SortedSet<byte[]> columnNameSet = new TreeSet<byte[]>(getComparator()); if (columnNames != null) columnNameSet.addAll(columnNames); for (String key : keys) { QueryFilter filter = sliceRange == null ? new NamesQueryFilter(key, queryPath, columnNameSet) : new SliceQueryFilter(key, queryPath, sliceRange.getStart(), sliceRange.getFinish(), sliceRange.reversed, sliceRange.count); rows.add(new Row(key, getColumnFamily(filter))); } return new RangeSliceReply(rows); } public AbstractType getComparator() { return DatabaseDescriptor.getComparator(table_, columnFamily_); } /** * Take a snapshot of this ColumnFamilyStore. * * @param snapshotName the name associated with the snapshot */ public void snapshot(String snapshotName) throws IOException { try { forceBlockingFlush(); } catch (ExecutionException e) { throw new RuntimeException(e); } catch (InterruptedException e) { throw new AssertionError(e); } // SSTable data may be missing from some disks when the snapshot is taken (this is normal and can happen occasionally), // so create the snapshot directory in every data location to prevent false monitoring alerts String[] dataFileLocations = DatabaseDescriptor.getAllDataFileLocations(); for (String dataDir : dataFileLocations) { String snapshotDirectoryPath = Table.getSnapshotPath(dataDir, table_, snapshotName); FileUtils.createDirectory(snapshotDirectoryPath); } for (SSTableReader ssTable : ssTables_) { File sourceFile = new File(ssTable.getFilename()); File dataDirectory = sourceFile.getParentFile().getParentFile(); String snapshotDirectoryPath = Table.getSnapshotPath(dataDirectory.getAbsolutePath(), table_, snapshotName); // create hard links for the table's data, index, and filter files File targetLink = new File(snapshotDirectoryPath, sourceFile.getName()); CLibrary.createHardLink(sourceFile, targetLink); sourceFile = new File(ssTable.indexFilename()); targetLink = new File(snapshotDirectoryPath, sourceFile.getName()); CLibrary.createHardLink(sourceFile, targetLink); sourceFile = new File(ssTable.filterFilename()); targetLink = new File(snapshotDirectoryPath, sourceFile.getName()); CLibrary.createHardLink(sourceFile, targetLink); if (logger_.isDebugEnabled()) logger_.debug("Snapshot for " + table_ + " table data file " + sourceFile.getAbsolutePath() + " created as " + targetLink.getAbsolutePath()); } } public boolean hasUnreclaimedSpace() { return ssTables_.getLiveSize() < ssTables_.getTotalSize(); } public long getTotalDiskSpaceUsed() { return ssTables_.getTotalSize(); } public long getLiveDiskSpaceUsed() { return ssTables_.getLiveSize(); } public int getLiveSSTableCount() { return ssTables_.size(); } /** raw cached row -- does not fetch the row if it is not present. not counted in cache statistics. */ public ColumnFamily getRawCachedRow(String key) { return ssTables_.getRowCache().getCapacity() == 0 ?
null : ssTables_.getRowCache().getInternal(key); } void invalidateCachedRow(String key) { ssTables_.getRowCache().remove(key); } public void forceMajorCompaction() { CompactionManager.instance.submitMajor(this); } public void invalidateRowCache() { ssTables_.getRowCache().clear(); } public int getRowCacheCapacity() { return ssTables_.getRowCache().getCapacity(); } public int getKeyCacheCapacity() { return ssTables_.getKeyCache().getCapacity(); } public int getRowCacheSize() { return ssTables_.getRowCache().getSize(); } public int getKeyCacheSize() { return ssTables_.getKeyCache().getSize(); } public static Iterable<ColumnFamilyStore> all() { Iterable<ColumnFamilyStore>[] stores = new Iterable[DatabaseDescriptor.getTables().size()]; int i = 0; for (Table table : Table.all()) { stores[i++] = table.getColumnFamilyStores(); } return Iterables.concat(stores); } public Iterable<IndexSummary.KeyPosition> allIndexPositions() { Collection<SSTableReader> sstables = getSSTables(); Iterable<IndexSummary.KeyPosition>[] positions = new Iterable[sstables.size()]; int i = 0; for (SSTableReader sstable: sstables) { positions[i++] = sstable.getIndexPositions(); } return Iterables.concat(positions); } /** * for testing. no effort is made to clear historical memtables. */ void clearUnsafe() { memtable_.clearUnsafe(); ssTables_.clearUnsafe(); ssTables_.getRowCache().clear(); ssTables_.getKeyCache().clear(); } public Set<Memtable> getMemtablesPendingFlush() { return memtablesPendingFlush; } public long getBloomFilterFalsePositives() { long count = 0L; for (SSTableReader sstable: getSSTables()) { count += sstable.getBloomFilterFalsePositiveCount(); } return count; } public long getRecentBloomFilterFalsePositives() { long count = 0L; for (SSTableReader sstable: getSSTables()) { count += sstable.getRecentBloomFilterFalsePositiveCount(); } return count; } public double getBloomFilterFalseRatio() { Long falseCount = 0L; Long trueCount = 0L; for (SSTableReader sstable: getSSTables()) { falseCount += sstable.getBloomFilterFalsePositiveCount(); trueCount += sstable.getBloomFilterTruePositiveCount(); } if (falseCount.equals(0L) && trueCount.equals(0L)) return 0d; return falseCount.doubleValue() / (trueCount.doubleValue() + falseCount.doubleValue()); } public double getRecentBloomFilterFalseRatio() { Long falseCount = 0L; Long trueCount = 0L; for (SSTableReader sstable: getSSTables()) { falseCount += sstable.getRecentBloomFilterFalsePositiveCount(); trueCount += sstable.getRecentBloomFilterTruePositiveCount(); } if (falseCount.equals(0L) && trueCount.equals(0L)) return 0d; return falseCount.doubleValue() / (trueCount.doubleValue() + falseCount.doubleValue()); } public long getRecentBloomFilterNegatives() { long count = 0L; for (SSTableReader sstable: getSSTables()) { count += sstable.getBloomFilterTracker().getRecentNegativeCount(); } return count; } public long getBloomFilterNegatives() { long count = 0L; for (SSTableReader sstable: getSSTables()) { count += sstable.getBloomFilterTracker().getNegativeCount(); } return count; } @Override public double getRecentBloomFilterNegativeRatio() { long posCount = 0L; long negCount = 0L; for (SSTableReader sstable: getSSTables()) { posCount += sstable.getRecentBloomFilterFalsePositiveCount() + sstable.getRecentBloomFilterTruePositiveCount(); negCount += sstable.getBloomFilterTracker().getRecentNegativeCount(); } if ( posCount== 0L && negCount== 0L ) return 0d; return posCount / ( (double) negCount + posCount); } @Override public long getBloomFilterColumnNegatives() { long count = 
0L; for (SSTableReader sstable: getSSTables()) { count += sstable.getBloomFilterTracker().getColumnNegativeCount(); } return count; } @Override public long getRecentBloomFilterColumnNegatives() { long count = 0L; for (SSTableReader sstable: getSSTables()) { count += sstable.getBloomFilterTracker().getRecentColumnNegativeCount(); } return count; } @Override public long getBloomFilterColumnReads() { long count = 0L; for (SSTableReader sstable: getSSTables()) { count += sstable.getBloomFilterTracker().getColumnReadsCount(); } return count; } @Override public long getRecentBloomFilterColumnReads() { long count = 0L; for (SSTableReader sstable: getSSTables()) { count += sstable.getBloomFilterTracker().getRecentColumnReadsCount(); } return count; } @Override public double getRecentBloomFilterColumnNegativeRatio() { long posCount = 0L; long negCount = 0L; for (SSTableReader sstable: getSSTables()) { posCount += sstable.getBloomFilterTracker().getRecentColumnReadsCount(); negCount += sstable.getBloomFilterTracker().getRecentColumnNegativeCount(); } if ( posCount== 0L && negCount== 0L ) return 0d; return posCount / ( (double) negCount + posCount); } public long estimateKeys() { return ssTables_.estimatedKeys(); } public void resetStats() { readStats_ = new LatencyTracker(); writeStats_ = new LatencyTracker(); } }
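/* Usage sketch (illustrative only; the keyspace/column family names, the update variable, and the calling context
 * are assumptions, not part of this class):
 *
 *   ColumnFamilyStore cfs = ColumnFamilyStore.createColumnFamilyStore("Keyspace1", "Standard1");
 *   cfs.apply("key1", columnFamilyUpdate);   // caller is responsible for Table.flusherLock, as documented on apply()
 *   cfs.forceBlockingFlush();                // switch the memtable and wait until the SSTable is on disk
 *   ColumnFamily row = cfs.getColumnFamily(new IdentityQueryFilter("key1", new QueryPath("Standard1")));
 */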