/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cassandra.db;
import java.io.File;
import java.io.IOException;
import java.net.InetAddress;
import java.util.*;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.regex.Pattern;
import com.google.common.base.Function;
import com.google.common.collect.Iterables;
import org.apache.log4j.Logger;
import org.apache.cassandra.concurrent.NamedThreadFactory;
import org.apache.cassandra.concurrent.StageManager;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.db.commitlog.CommitLog;
import org.apache.cassandra.db.filter.IdentityQueryFilter;
import org.apache.cassandra.db.filter.QueryFilter;
import org.apache.cassandra.db.filter.QueryPath;
import org.apache.cassandra.dht.Range;
import org.apache.cassandra.io.SSTableReader;
import org.apache.cassandra.io.util.FileUtils;
import org.apache.cassandra.utils.CopyOnWriteMap;
import org.apache.cassandra.utils.FBUtilities;
import org.apache.cassandra.utils.WrappedRunnable;
import org.cliffc.high_scale_lib.NonBlockingHashMap;
public class Table
{
public static final String SYSTEM_TABLE = "system";
private static final Logger logger = Logger.getLogger(Table.class);
private static final String SNAPSHOT_SUBDIR_NAME = "snapshots";
    /**
     * Accesses to CFS.memtable should acquire this for thread safety.
     * ColumnFamilyStore.maybeSwitchMemtable should acquire the writeLock; see that method for the full explanation.
     *
     * (Enabling fairness in the RRWL is observed to decrease throughput, so we leave it off.)
     */
static final ReentrantReadWriteLock flusherLock = new ReentrantReadWriteLock();
private static Timer flushTimer = new Timer("FLUSH-TIMER");
private final boolean waitForCommitLog;
// This is a result of pushing down the point in time when storage directories get created. It used to happen in
// CassandraDaemon, but it is possible to call Table.open without a running daemon, so it made sense to ensure
// proper directories here.
static
{
try
{
DatabaseDescriptor.createAllDirectories();
}
catch (IOException ex)
{
throw new RuntimeException(ex);
}
}
    /*
     * This class represents the metadata of this Table: essentially the column family
     * names and the IDs associated with them. We use these IDs in the Commit Log header
     * to determine when a log file that has been rolled can be deleted.
     */
public static class TableMetadata
{
private static HashMap<String,TableMetadata> tableMetadataMap = new HashMap<String,TableMetadata>();
private static Map<Integer, String> idCfMap_ = new HashMap<Integer, String>();
static
{
try
{
DatabaseDescriptor.storeMetadata();
}
catch (IOException e)
{
throw new RuntimeException(e);
}
}
public static synchronized Table.TableMetadata instance(String tableName) throws IOException
{
if ( tableMetadataMap.get(tableName) == null )
{
tableMetadataMap.put(tableName, new Table.TableMetadata());
}
return tableMetadataMap.get(tableName);
}
/* The mapping between column family and the column type. */
private Map<String, String> cfTypeMap_ = new HashMap<String, String>();
private Map<String, Integer> cfIdMap_ = new HashMap<String, Integer>();
public void add(String cf, int id)
{
add(cf, id, "Standard");
}
public void add(String cf, int id, String type)
{
if (logger.isDebugEnabled())
logger.debug("adding " + cf + " as " + id);
assert !idCfMap_.containsKey(id);
cfIdMap_.put(cf, id);
idCfMap_.put(id, cf);
cfTypeMap_.put(cf, type);
}
public boolean isEmpty()
{
return cfIdMap_.isEmpty();
}
int getColumnFamilyId(String columnFamily)
{
return cfIdMap_.get(columnFamily);
}
public static String getColumnFamilyName(int id)
{
return idCfMap_.get(id);
}
String getColumnFamilyType(String cfName)
{
return cfTypeMap_.get(cfName);
}
Set<String> getColumnFamilies()
{
return cfIdMap_.keySet();
}
int size()
{
return cfIdMap_.size();
}
boolean isValidColumnFamily(String cfName)
{
return cfIdMap_.containsKey(cfName);
}
public String toString()
{
return "TableMetadata(" + FBUtilities.mapToString(cfIdMap_) + ")";
}
public static int getColumnFamilyCount()
{
return idCfMap_.size();
}
public static String getColumnFamilyIDString()
{
return FBUtilities.mapToString(tableMetadataMap);
}
}
    /** Table objects, one per keyspace. Only one instance should ever exist for any given keyspace. */
private static final Map<String, Table> instances = new NonBlockingHashMap<String, Table>();
/* Table name. */
public final String name;
/* Handle to the Table Metadata */
private final Table.TableMetadata tableMetadata;
/* ColumnFamilyStore per column family */
private final Map<String, ColumnFamilyStore> columnFamilyStores = new HashMap<String, ColumnFamilyStore>();
    /**
     * Listeners consulted before mutations are applied.
     * Key: column family name
     * Value: its IStoreApplyListener
     */
private Map<String, IStoreApplyListener> storeFilters = null;
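    /**
     * Returns the Table instance for the given keyspace name, creating it on first access.
     * Construction is synchronized on Table.class and re-checked, so exactly one instance is
     * ever created per keyspace; afterwards the instance is served from the {@link #instances} map.
     * <p>
     * A minimal usage sketch (the keyspace and column family names are illustrative only):
     * <pre>{@code
     * Table keyspace = Table.open("Keyspace1");
     * ColumnFamilyStore cfs = keyspace.getColumnFamilyStore("Standard1");
     * }</pre>
     */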
public static Table open(String table) throws IOException
{
Table tableInstance = instances.get(table);
if (tableInstance == null)
{
            // instantiate the Table. we could use putIfAbsent, but it's important to make sure this
            // happens only once per keyspace, so we synchronize and re-check before doing it.
synchronized (Table.class)
{
tableInstance = instances.get(table);
if (tableInstance == null)
{
tableInstance = new Table(table);
instances.put(table, tableInstance);
//table has to be constructed and in the cache before cacheRow can be called
for (ColumnFamilyStore cfs : tableInstance.getColumnFamilyStores())
cfs.initRowCache();
}
}
}
return tableInstance;
}
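    /** Resolves a column family id to its name and searches every open table for the matching store. */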
    public static ColumnFamilyStore getColumnFamilyStore(int cfid)
    {
        String cfName = TableMetadata.getColumnFamilyName(cfid);
        for (Table t : Table.all())
        {
            ColumnFamilyStore store = t.getColumnFamilyStore(cfName);
            if (store != null)
                return store;
        }
        throw new IllegalArgumentException("Cannot find column family with id=" + cfid);
    }
public Set<String> getColumnFamilies()
{
return tableMetadata.getColumnFamilies();
}
public Collection<ColumnFamilyStore> getColumnFamilyStores()
{
return Collections.unmodifiableCollection(columnFamilyStores.values());
}
public ColumnFamilyStore getColumnFamilyStore(String cfName)
{
return columnFamilyStores.get(cfName);
}
/**
* Do a cleanup of keys that do not belong locally.
*/
public void forceCleanup()
{
if (name.equals(SYSTEM_TABLE))
throw new UnsupportedOperationException("Cleanup of the system table is neither necessary nor wise");
// Sort the column families in order of SSTable size, so cleanup of smaller CFs
// can free up space for larger ones
List<ColumnFamilyStore> sortedColumnFamilies = new ArrayList<ColumnFamilyStore>(columnFamilyStores.values());
Collections.sort(sortedColumnFamilies, new Comparator<ColumnFamilyStore>()
{
// Compare first on size and, if equal, sort by name (arbitrary & deterministic).
public int compare(ColumnFamilyStore cf1, ColumnFamilyStore cf2)
{
long diff = (cf1.getTotalDiskSpaceUsed() - cf2.getTotalDiskSpaceUsed());
if (diff > 0)
return 1;
if (diff < 0)
return -1;
return cf1.columnFamily_.compareTo(cf2.columnFamily_);
}
});
// Cleanup in sorted order to free up space for the larger ones
for (ColumnFamilyStore cfs : sortedColumnFamilies)
cfs.forceCleanup();
}
    /**
     * Take a snapshot of the entire set of column families. The snapshot is named after the
     * current timestamp, with the client-supplied tag appended when one is given.
     *
     * @param clientSuppliedName the tag to append to the snapshot name; may be null or empty,
     * in which case the timestamp alone is used.
     */
public void snapshot(String clientSuppliedName) throws IOException
{
String snapshotName = Long.toString(System.currentTimeMillis());
if (clientSuppliedName != null && !clientSuppliedName.equals(""))
{
snapshotName = snapshotName + "-" + clientSuppliedName;
}
for (ColumnFamilyStore cfStore : columnFamilyStores.values())
{
cfStore.snapshot(snapshotName);
}
submitArchiveSnapshot();
}
    /**
     * Take a snapshot of the column families whose names match the given regular expression.
     * The snapshot is named after the current timestamp, with the client-supplied tag appended
     * when one is given.
     *
     * @param cfNameRegExp regular expression selecting the column families to snapshot; it is
     * matched against the lower-cased column family name.
     * @param clientSuppliedName the tag to append to the snapshot name; may be null or empty.
     */
public void snapshot(String cfNameRegExp, String clientSuppliedName) throws IOException
{
Pattern cfNamePattern = Pattern.compile(cfNameRegExp);
String snapshotName = Long.toString(System.currentTimeMillis());
if (clientSuppliedName != null && !clientSuppliedName.equals(""))
{
snapshotName = snapshotName + "-" + clientSuppliedName;
}
for (ColumnFamilyStore cfStore : columnFamilyStores.values())
{
if (cfNamePattern.matcher(cfStore.getColumnFamilyName().toLowerCase()).matches()) {
cfStore.snapshot(snapshotName);
}
}
submitArchiveSnapshot();
}
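    /**
     * Submits an asynchronous snapshot archive task to the snapshot-archive stage, if data
     * archiving is enabled. Returns the task's Future, or null when archiving is disabled.
     */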
private Future<?> submitArchiveSnapshot()
{
if (DatabaseDescriptor.isDataArchiveEnabled())
{
return StageManager.getStage(StageManager.SNAPSHOT_ARCHIVE_STAGE).submit(
new WrappedRunnable()
{
@Override
protected void runMayThrow() throws Exception
{
archiveSnapshot(false);
}
}
);
}
return null;
}
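    /** Archives any existing snapshots synchronously, at an effectively unthrottled rate (used during startup). */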
private void forceBlockingArchiveSnapshot() throws IOException
{
if (DatabaseDescriptor.isDataArchiveEnabled())
{
archiveSnapshot(true);
}
}
    /**
     * Move snapshot files to the separate archive disk.
     *
     * @param force when true, copy at an effectively unthrottled rate instead of the configured
     * archive throttle (used by the blocking archive performed at startup).
     */
public void archiveSnapshot(boolean force) throws IOException
{
int chunkSize = 128*1024; // 128kbytes
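        // Throttle expressed in 128 KiB chunks per second. The 1024/128 factor (= 8 chunks per MB)
        // suggests getDataArchiveThrottle() is configured in MB/s; a forced archive uses an
        // effectively unlimited rate of 10000 MB/s.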
int chunksSec = force ? 10000*(1024/128) : DatabaseDescriptor.getDataArchiveThrottle()*(1024/128);
for (String dataDirPath : DatabaseDescriptor.getAllDataFileLocations())
{
String snapshotPath = dataDirPath + File.separator + name + File.separator + SNAPSHOT_SUBDIR_NAME;
File snapshotDir = new File(snapshotPath);
if (snapshotDir.exists())
{
File archiveLocation = DatabaseDescriptor.getDataArchiveFileLocationForSnapshot(name);
logger.info("Moving snapshot directory " + snapshotPath + " to " + archiveLocation + (force ? " (forced, blocking)" :""));
if (DatabaseDescriptor.isDataArchiveHardLinksEnabled())
{
                    long[] copiedAndLinked = FileUtils.copyDirLinkDublicates(
                            snapshotDir, archiveLocation, chunkSize, chunksSec);
StringBuilder stringBuilder = new StringBuilder();
stringBuilder.append("Move of ");
stringBuilder.append(snapshotPath);
stringBuilder.append(" to ");
stringBuilder.append(archiveLocation);
stringBuilder.append(" completed. Copied ");
stringBuilder.append(FileUtils.stringifyFileSize(copiedAndLinked[0]));
stringBuilder.append(", linked ");
stringBuilder.append(FileUtils.stringifyFileSize(copiedAndLinked[1]));
stringBuilder.append(" .Removing snapshot directory ");
stringBuilder.append(snapshotPath);
logger.info(stringBuilder.toString());
}
else
{
long copied = FileUtils.copyDir(snapshotDir, archiveLocation, chunkSize, chunksSec);
logger.info("Move of " + (copied / 1024 / 1024) + "MB to " + archiveLocation
+ " completed. Removing snapshot directory " + snapshotPath);
}
FileUtils.deleteDir(snapshotDir);
}
}
}
    /**
     * Clear all the snapshots of this table.
     */
public void clearSnapshot() throws IOException
{
for (String dataDirPath : DatabaseDescriptor.getAllDataFileLocations())
{
String snapshotPath = dataDirPath + File.separator + name + File.separator + SNAPSHOT_SUBDIR_NAME;
File snapshotDir = new File(snapshotPath);
if (snapshotDir.exists())
{
if (logger.isDebugEnabled())
logger.debug("Removing snapshot directory " + snapshotPath);
FileUtils.deleteDir(snapshotDir);
}
}
}
    /*
     * This method is invoked only during a bootstrap process. We effectively do a complete
     * compaction, since the requested ranges tell us whether the files need to be split.
     */
public List<String> forceAntiCompaction(Collection<Range> ranges, InetAddress target)
{
List<String> allResults = new ArrayList<String>();
Set<String> columnFamilies = tableMetadata.getColumnFamilies();
for ( String columnFamily : columnFamilies )
{
ColumnFamilyStore cfStore = columnFamilyStores.get( columnFamily );
try
{
allResults.addAll(CompactionManager.instance.submitAnticompaction(cfStore, ranges, target).get());
}
catch (Exception e)
{
throw new RuntimeException(e);
}
}
return allResults;
}
/*
* This method is an ADMIN operation to force compaction
* of all SSTables on disk.
*/
public void forceCompaction()
{
Set<String> columnFamilies = tableMetadata.getColumnFamilies();
for ( String columnFamily : columnFamilies )
{
ColumnFamilyStore cfStore = columnFamilyStores.get( columnFamily );
if ( cfStore != null )
CompactionManager.instance.submitMajor(cfStore);
}
}
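    /** Collects the SSTables currently on disk for every column family in this table. */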
List<SSTableReader> getAllSSTablesOnDisk()
{
List<SSTableReader> list = new ArrayList<SSTableReader>();
Set<String> columnFamilies = tableMetadata.getColumnFamilies();
for ( String columnFamily : columnFamilies )
{
ColumnFamilyStore cfStore = columnFamilyStores.get( columnFamily );
if ( cfStore != null )
list.addAll(cfStore.getSSTables());
}
return list;
}
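    /**
     * Loads the table metadata, initializes a ColumnFamilyStore for every column family in
     * parallel, archives any leftover snapshots (blocking, when data archiving is enabled),
     * and schedules the periodic check that flushes memtables whose lifetime has expired.
     */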
private Table(final String table) throws IOException
{
name = table;
waitForCommitLog = DatabaseDescriptor.getCommitLogSync() == DatabaseDescriptor.CommitLogSync.batch;
tableMetadata = Table.TableMetadata.instance(table);
        // MM: speed up startup by parallelizing all CF initializations
int threads = Math.max( Runtime.getRuntime().availableProcessors(), DatabaseDescriptor.getAllDataFileLocations().length);
logger.info("Starting "+table+" keyspace using "+ threads+" threads");
ExecutorService initExecutor = Executors.newFixedThreadPool(threads, new NamedThreadFactory("KS-INIT-"+table));
List<Callable<ColumnFamilyStore>> tasks = new ArrayList<Callable<ColumnFamilyStore>>();
for (final String columnFamily : tableMetadata.getColumnFamilies())
{
// columnFamilyStores.put(columnFamily, ColumnFamilyStore.createColumnFamilyStore(table, columnFamily));
tasks.add(new Callable<ColumnFamilyStore>()
{
@Override
public ColumnFamilyStore call() throws Exception
{
return ColumnFamilyStore.createColumnFamilyStore(table, columnFamily);
}
});
}
        try
        {
            List<Future<ColumnFamilyStore>> cfs = initExecutor.invokeAll(tasks);
            for (Future<ColumnFamilyStore> future : cfs)
            {
                ColumnFamilyStore cfStore = future.get();
                columnFamilyStores.put(cfStore.getColumnFamilyName(), cfStore);
            }
        }
        catch (Exception e1)
        {
            throw new RuntimeException(table + " initialization failure", e1);
        }
        finally
        {
            initExecutor.shutdown();
        }
        // if there are snapshots that have not been archived yet, archive them now (blocking)
forceBlockingArchiveSnapshot();
// check 10x as often as the lifetime, so we can exceed lifetime by 10% at most
int checkMs = DatabaseDescriptor.getMemtableLifetimeMS() / 10;
flushTimer.schedule(new TimerTask()
{
public void run()
{
for (ColumnFamilyStore cfs : columnFamilyStores.values())
{
try
{
cfs.forceFlushIfExpired();
}
                    catch (ExecutionException e)
                    {
                        throw new RuntimeException(e);
                    }
                    catch (InterruptedException e)
                    {
                        throw new RuntimeException(e);
                    }
}
}
}, checkMs, checkMs);
}
public int getColumnFamilyId(String columnFamily)
{
return tableMetadata.getColumnFamilyId(columnFamily);
}
/**
* Selects the specified column family for the specified key.
*/
@Deprecated // single CFs could be larger than memory
public ColumnFamily get(String key, String cfName) throws IOException
{
ColumnFamilyStore cfStore = columnFamilyStores.get(cfName);
assert cfStore != null : "Column family " + cfName + " has not been defined";
return cfStore.getColumnFamily(new IdentityQueryFilter(key, new QueryPath(cfName)));
}
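    /** Fetches a single row for the filter's key from the filter's column family. */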
public Row getRow(QueryFilter filter) throws IOException
{
ColumnFamilyStore cfStore = columnFamilyStores.get(filter.getColumnFamilyName());
ColumnFamily columnFamily = cfStore.getColumnFamily(filter);
return new Row(filter.key, columnFamily);
}
    /**
     * Adds the given row mutation to the commit log associated with this table (when
     * writeCommitLog is set) and then applies each column family's data to the corresponding
     * column family store's memtable. Registered IStoreApplyListeners may filter column
     * families out of the mutation before it is applied.
     *
     * @param mutation the row mutation to apply
     * @param serializedMutation the pre-serialized form of the mutation, as written to the commit log
     * @param writeCommitLog whether the mutation should be recorded in the commit log
     */
public void apply(RowMutation mutation, Object serializedMutation, boolean writeCommitLog) throws IOException
{
HashMap<ColumnFamilyStore,Memtable> memtablesToFlush = new HashMap<ColumnFamilyStore, Memtable>(2);
RowMutation filteredMutation = null;
if (storeFilters!=null)
{
            // invoke listeners prior to entering the critical section
for (ColumnFamily columnFamily : mutation.getColumnFamilies())
{
IStoreApplyListener listener = storeFilters.get( columnFamily.name() );
if (listener!=null)
{
                    if (!listener.preapply(mutation.key(), columnFamily))
                    {
                        // create a copy of the mutation to avoid ConcurrentModificationException in sender threads
                        if (filteredMutation == null)
                        {
                            filteredMutation = new RowMutation(mutation.getTable(), mutation.key(), new HashMap<String, ColumnFamily>(mutation.modifications_));
                        }
                        filteredMutation.modifications_.remove(columnFamily.name());
                    }
}
}
}
        if (filteredMutation != null)
        {
            // something was filtered out
            mutation = filteredMutation;
            if (mutation.isEmpty())
            {
                return;
            }
            // rebuild the serialized mutation to match the filtered one
            serializedMutation = mutation.getSerializedBuffer();
        }
// write the mutation to the commitlog and memtables
flusherLock.readLock().lock();
try
{
if (writeCommitLog)
{
CommitLog.instance().add(mutation, serializedMutation);
}
for (ColumnFamily columnFamily : mutation.getColumnFamilies())
{
Memtable memtableToFlush;
ColumnFamilyStore cfs = columnFamilyStores.get(columnFamily.name());
if ((memtableToFlush=cfs.apply(mutation.key(), columnFamily)) != null)
memtablesToFlush.put(cfs, memtableToFlush);
ColumnFamily cachedRow = cfs.getRawCachedRow(mutation.key());
if (cachedRow != null)
cachedRow.addAll(columnFamily);
}
}
finally
{
flusherLock.readLock().unlock();
}
        // flush memtables that got filled up. usually memtablesToFlush will be empty and this will be a no-op
for (Map.Entry<ColumnFamilyStore, Memtable> entry : memtablesToFlush.entrySet())
entry.getKey().maybeSwitchMemtable(entry.getValue(), writeCommitLog);
}
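    /**
     * Triggers a flush of every column family store in this table without waiting for completion.
     *
     * @return futures for the flushes that were actually submitted; stores with nothing to flush are skipped.
     */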
public List<Future<?>> flush() throws IOException
{
List<Future<?>> futures = new ArrayList<Future<?>>();
for (String cfName : columnFamilyStores.keySet())
{
Future<?> future = columnFamilyStores.get(cfName).forceFlush();
if (future != null)
futures.add(future);
}
return futures;
}
    /**
     * Flushes memtables one by one, waiting for each flush to complete. This takes longer than
     * {@link #flush()} but is gentler on an already loaded system.
     */
public void flushAndWait()
{
for (String cfName : columnFamilyStores.keySet())
{
Future<?> future = columnFamilyStores.get(cfName).forceFlush();
            if (future != null)
            {
                try
                {
                    future.get();
                }
                catch (Exception e)
                {
                    throw new RuntimeException("Cannot flush " + cfName, e);
                }
            }
}
}
// for binary load path. skips commitlog.
void load(RowMutation rowMutation) throws IOException
{
String key = rowMutation.key();
for (ColumnFamily columnFamily : rowMutation.getColumnFamilies())
{
Collection<IColumn> columns = columnFamily.getSortedColumns();
for (IColumn column : columns)
{
ColumnFamilyStore cfStore = columnFamilyStores.get(new String(column.name(), "UTF-8"));
cfStore.applyBinary(key, column.value());
}
}
}
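    /** Builds the path of the named snapshot directory for the given table under the given data directory. */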
public static String getSnapshotPath(String dataDirPath, String tableName, String snapshotName)
{
return dataDirPath + File.separator + tableName + File.separator + SNAPSHOT_SUBDIR_NAME + File.separator + snapshotName;
}
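    /** Lazily opens and returns every keyspace configured in DatabaseDescriptor. */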
public static Iterable<Table> all()
{
Function<String, Table> transformer = new Function<String, Table>()
{
public Table apply(String tableName)
{
try
{
return Table.open(tableName);
}
catch (IOException e)
{
throw new FSReadError(e);
}
}
};
return Iterables.transform(DatabaseDescriptor.getTables(), transformer);
}
void setStoreListener(ColumnFamilyStore store, IStoreApplyListener listener)
{
assert storeFilters == null || storeFilters.get(store.columnFamily_)==null || listener==null;
        synchronized (this)
        {
            if (storeFilters == null)
                storeFilters = new CopyOnWriteMap<String, IStoreApplyListener>();
            storeFilters.put(store.columnFamily_, listener);
        }
}
}