/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.cassandra.db;

import java.io.File;
import java.io.IOError;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.*;
import java.util.concurrent.*;
import java.util.concurrent.locks.ReentrantReadWriteLock;

import com.google.common.base.Function;
import com.google.common.collect.Iterables;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.cassandra.config.CFMetaData;
import org.apache.cassandra.config.ConfigurationException;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.config.KSMetaData;
import org.apache.cassandra.db.commitlog.CommitLog;
import org.apache.cassandra.db.compaction.CompactionInfo;
import org.apache.cassandra.db.compaction.CompactionType;
import org.apache.cassandra.db.filter.QueryFilter;
import org.apache.cassandra.db.filter.QueryPath;
import org.apache.cassandra.dht.LocalToken;
import org.apache.cassandra.io.sstable.ReducingKeyIterator;
import org.apache.cassandra.io.sstable.SSTableDeletingReference;
import org.apache.cassandra.io.sstable.SSTableReader;
import org.apache.cassandra.io.util.FileUtils;
import org.apache.cassandra.locator.AbstractReplicationStrategy;
import org.apache.cassandra.service.StorageService;
import org.apache.cassandra.utils.ByteBufferUtil;
import org.apache.cassandra.utils.NodeId;
import org.cliffc.high_scale_lib.NonBlockingHashMap;

public class Table
{
    public static final String SYSTEM_TABLE = "system";
    public static final String SNAPSHOT_SUBDIR_NAME = "snapshots";

    private static final Logger logger = LoggerFactory.getLogger(Table.class);

    /**
     * Accesses to CFS.memtable should acquire this for thread safety.
     * Table.maybeSwitchMemtable should acquire the writeLock; see that method for the full explanation.
     *
     * (Enabling fairness in the RRWL is observed to decrease throughput, so we leave it off.)
     */
    static final ReentrantReadWriteLock switchLock = new ReentrantReadWriteLock();

    // It is possible to call Table.open without a running daemon, so it makes sense to ensure
    // proper directories here as well as in CassandraDaemon.
    static
    {
        if (!StorageService.instance.isClientMode())
        {
            try
            {
                DatabaseDescriptor.createAllDirectories();
            }
            catch (IOException ex)
            {
                throw new IOError(ex);
            }
        }
    }

    /** Table objects, one per keyspace.  Only one instance should ever exist for any given keyspace. */
    private static final Map<String, Table> instances = new NonBlockingHashMap<String, Table>();
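    // Typical lookup path, as an illustrative sketch (the keyspace and column family
    // names here are hypothetical, not defined by this class):
    //   Table table = Table.open("Keyspace1");
    //   ColumnFamilyStore cfs = table.getColumnFamilyStore("Standard1");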
    /** Table name. */
    public final String name;
    /* ColumnFamilyStore per column family */
    private final Map<Integer, ColumnFamilyStore> columnFamilyStores = new ConcurrentHashMap<Integer, ColumnFamilyStore>();
    private final Object[] indexLocks;
    private ScheduledFuture<?> flushTask;
    private volatile AbstractReplicationStrategy replicationStrategy;

    public static Table open(String table)
    {
        Table tableInstance = instances.get(table);
        if (tableInstance == null)
        {
            // instantiate the Table.  we could use putIfAbsent but it's important to make sure it is only done once
            // per keyspace, so we synchronize and re-check before doing it.
            synchronized (Table.class)
            {
                tableInstance = instances.get(table);
                if (tableInstance == null)
                {
                    // open and store the table
                    tableInstance = new Table(table);
                    instances.put(table, tableInstance);

                    // table has to be constructed and in the cache before cacheRow can be called
                    for (ColumnFamilyStore cfs : tableInstance.getColumnFamilyStores())
                        cfs.initCaches();
                }
            }
        }
        return tableInstance;
    }

    public static Table clear(String table) throws IOException
    {
        synchronized (Table.class)
        {
            Table t = instances.remove(table);
            if (t != null)
            {
                t.flushTask.cancel(false);
                for (ColumnFamilyStore cfs : t.getColumnFamilyStores())
                    t.unloadCf(cfs);
            }
            return t;
        }
    }

    public Collection<ColumnFamilyStore> getColumnFamilyStores()
    {
        return Collections.unmodifiableCollection(columnFamilyStores.values());
    }

    public ColumnFamilyStore getColumnFamilyStore(String cfName)
    {
        Integer id = CFMetaData.getId(name, cfName);
        if (id == null)
            throw new IllegalArgumentException(String.format("Unknown table/cf pair (%s.%s)", name, cfName));
        return getColumnFamilyStore(id);
    }

    public ColumnFamilyStore getColumnFamilyStore(Integer id)
    {
        ColumnFamilyStore cfs = columnFamilyStores.get(id);
        if (cfs == null)
            throw new IllegalArgumentException("Unknown CF " + id);
        return cfs;
    }

    /**
     * Do a cleanup of keys that do not belong locally.
     */
    public void forceCleanup(NodeId.OneShotRenewer renewer) throws IOException, ExecutionException, InterruptedException
    {
        if (name.equals(SYSTEM_TABLE))
            throw new UnsupportedOperationException("Cleanup of the system table is neither necessary nor wise");

        // Sort the column families in order of SSTable size, so cleanup of smaller CFs
        // can free up space for larger ones
        List<ColumnFamilyStore> sortedColumnFamilies = new ArrayList<ColumnFamilyStore>(columnFamilyStores.values());
        Collections.sort(sortedColumnFamilies, new Comparator<ColumnFamilyStore>()
        {
            // Compare first on size and, if equal, sort by name (arbitrary & deterministic).
            public int compare(ColumnFamilyStore cf1, ColumnFamilyStore cf2)
            {
                long diff = (cf1.getTotalDiskSpaceUsed() - cf2.getTotalDiskSpaceUsed());
                if (diff > 0)
                    return 1;
                if (diff < 0)
                    return -1;
                return cf1.columnFamily.compareTo(cf2.columnFamily);
            }
        });

        // Cleanup in sorted order to free up space for the larger ones
        for (ColumnFamilyStore cfs : sortedColumnFamilies)
            cfs.forceCleanup(renewer);
    }

    /**
     * Take a snapshot of the entire set of column families.
     *
     * @param snapshotName the tag associated with the name of the snapshot.  This value may not be null.
     */
    public void snapshot(String snapshotName)
    {
        assert snapshotName != null;
        for (ColumnFamilyStore cfStore : columnFamilyStores.values())
            cfStore.snapshot(snapshotName);
    }
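    // The method below produces names of the form "<millis>" or "<millis>-<clientName>",
    // e.g. (hypothetical values) "1310680073187" or "1310680073187-before-upgrade".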
    /**
     * @param clientSuppliedName may be null
     * @return the current time in milliseconds, suffixed with the client-supplied name if one was given
     */
    public static String getTimestampedSnapshotName(String clientSuppliedName)
    {
        String snapshotName = Long.toString(System.currentTimeMillis());
        if (clientSuppliedName != null && !clientSuppliedName.equals(""))
        {
            snapshotName = snapshotName + "-" + clientSuppliedName;
        }
        return snapshotName;
    }

    /**
     * Check whether a snapshot with the given name exists in any data directory.
     *
     * @param snapshotName the user supplied snapshot name
     * @return true if the snapshot exists
     */
    public boolean snapshotExists(String snapshotName)
    {
        for (String dataDirPath : DatabaseDescriptor.getAllDataFileLocations())
        {
            String snapshotPath = dataDirPath + File.separator + name + File.separator + SNAPSHOT_SUBDIR_NAME
                                  + File.separator + snapshotName;
            File snapshot = new File(snapshotPath);
            if (snapshot.exists())
            {
                return true;
            }
        }
        return false;
    }

    /**
     * Clear all the snapshots for a given table.
     *
     * @param tag the user supplied snapshot name; if empty, the entire snapshot directory is removed
     */
    public void clearSnapshot(String tag) throws IOException
    {
        for (String dataDirPath : DatabaseDescriptor.getAllDataFileLocations())
        {
            // If tag is empty we will delete the entire snapshot directory
            String snapshotPath = dataDirPath + File.separator + name + File.separator + SNAPSHOT_SUBDIR_NAME
                                  + File.separator + tag;
            File snapshotDir = new File(snapshotPath);
            if (snapshotDir.exists())
            {
                if (logger.isDebugEnabled())
                    logger.debug("Removing snapshot directory " + snapshotPath);
                FileUtils.deleteRecursive(snapshotDir);
            }
        }
    }

    /**
     * @return A list of open SSTableReaders (TODO: ensure that the caller doesn't modify these).
     */
    public List<SSTableReader> getAllSSTables()
    {
        List<SSTableReader> list = new ArrayList<SSTableReader>();
        for (ColumnFamilyStore cfStore : columnFamilyStores.values())
            list.addAll(cfStore.getSSTables());
        return list;
    }

    private Table(String table)
    {
        name = table;
        KSMetaData ksm = DatabaseDescriptor.getKSMetaData(table);
        assert ksm != null : "Unknown keyspace " + table;
        try
        {
            createReplicationStrategy(ksm);
        }
        catch (ConfigurationException e)
        {
            throw new RuntimeException(e);
        }

        indexLocks = new Object[DatabaseDescriptor.getConcurrentWriters() * 128];
        for (int i = 0; i < indexLocks.length; i++)
            indexLocks[i] = new Object();

        // create data directories.
        for (String dataDir : DatabaseDescriptor.getAllDataFileLocations())
        {
            try
            {
                String keyspaceDir = dataDir + File.separator + table;
                if (!StorageService.instance.isClientMode())
                    FileUtils.createDirectory(keyspaceDir);

                // remove the deprecated streaming directory.
                File streamingDir = new File(keyspaceDir, "stream");
                if (streamingDir.exists())
                    FileUtils.deleteRecursive(streamingDir);
            }
            catch (IOException ex)
            {
                throw new IOError(ex);
            }
        }

        for (CFMetaData cfm : new ArrayList<CFMetaData>(DatabaseDescriptor.getTableDefinition(table).cfMetaData().values()))
        {
            logger.debug("Initializing {}.{}", name, cfm.cfName);
            initCf(cfm.cfId, cfm.cfName);
        }

        Runnable runnable = new Runnable()
        {
            public void run()
            {
                for (ColumnFamilyStore cfs : columnFamilyStores.values())
                {
                    cfs.forceFlushIfExpired();
                }
            }
        };
        flushTask = StorageService.tasks.scheduleWithFixedDelay(runnable, 10, 10, TimeUnit.SECONDS);
    }

    public void createReplicationStrategy(KSMetaData ksm) throws ConfigurationException
    {
        if (replicationStrategy != null)
            StorageService.instance.getTokenMetadata().unregister(replicationStrategy);

        replicationStrategy = AbstractReplicationStrategy.createReplicationStrategy(ksm.name,
                                                                                    ksm.strategyClass,
                                                                                    StorageService.instance.getTokenMetadata(),
                                                                                    DatabaseDescriptor.getEndpointSnitch(),
                                                                                    ksm.strategyOptions);
    }
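    // Illustrative sketch for createReplicationStrategy (hypothetical values): a KSMetaData
    // whose strategyClass names SimpleStrategy with strategyOptions {"replication_factor": "3"}
    // yields a strategy that places each row on 3 replicas around the token ring.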
    // best invoked on the compaction manager.
    public void dropCf(Integer cfId) throws IOException
    {
        assert columnFamilyStores.containsKey(cfId);
        ColumnFamilyStore cfs = columnFamilyStores.remove(cfId);
        if (cfs == null)
            return;

        unloadCf(cfs);
        cfs.removeAllSSTables();
    }

    // disassociate a cfs from this table instance.
    private void unloadCf(ColumnFamilyStore cfs) throws IOException
    {
        try
        {
            cfs.forceBlockingFlush();
        }
        catch (ExecutionException e)
        {
            throw new IOException(e);
        }
        catch (InterruptedException e)
        {
            throw new IOException(e);
        }
        cfs.unregisterMBean();
    }

    /** adds a cf to internal structures (ends up creating disk files). */
    public void initCf(Integer cfId, String cfName)
    {
        assert !columnFamilyStores.containsKey(cfId)
               : String.format("tried to init %s as %s, but already used by %s", cfName, cfId, columnFamilyStores.get(cfId));
        columnFamilyStores.put(cfId, ColumnFamilyStore.createColumnFamilyStore(this, cfName));
    }

    /** basically a combined drop and add */
    public void renameCf(Integer cfId, String newName) throws IOException
    {
        assert columnFamilyStores.containsKey(cfId);
        ColumnFamilyStore cfs = columnFamilyStores.remove(cfId);
        unloadCf(cfs);
        cfs.renameSSTables(newName);
        initCf(cfId, newName);
    }

    public Row getRow(QueryFilter filter) throws IOException
    {
        ColumnFamilyStore cfStore = getColumnFamilyStore(filter.getColumnFamilyName());
        ColumnFamily columnFamily = cfStore.getColumnFamily(filter);
        return new Row(filter.key, columnFamily);
    }

    /**
     * This method adds the row to the Commit Log associated with this table.
     * Once this happens the data associated with the individual column families
     * is also written to the column family store's memtable.
     */
    public void apply(RowMutation mutation, boolean writeCommitLog) throws IOException
    {
        List<Memtable> memtablesToFlush = Collections.emptyList();
        if (logger.isDebugEnabled())
            logger.debug("applying mutation of row {}", ByteBufferUtil.bytesToHex(mutation.key()));

        // write the mutation to the commitlog and memtables
        switchLock.readLock().lock();
        try
        {
            if (writeCommitLog)
                CommitLog.instance.add(mutation);

            DecoratedKey<?> key = StorageService.getPartitioner().decorateKey(mutation.key());
            for (ColumnFamily cf : mutation.getColumnFamilies())
            {
                ColumnFamilyStore cfs = columnFamilyStores.get(cf.id());
                if (cfs == null)
                {
                    logger.error("Attempting to mutate non-existent column family " + cf.id());
                    continue;
                }

                SortedSet<ByteBuffer> mutatedIndexedColumns = null;
                for (ByteBuffer column : cfs.getIndexedColumns())
                {
                    if (cf.getColumnNames().contains(column) || cf.isMarkedForDelete())
                    {
                        if (mutatedIndexedColumns == null)
                            mutatedIndexedColumns = new TreeSet<ByteBuffer>();
                        mutatedIndexedColumns.add(column);
                        if (logger.isDebugEnabled())
                        {
                            // can't actually use validator to print value here, because we overload value
                            // for deletion timestamp as well (which may not be a well-formed value for the column type)
                            ByteBuffer value = cf.getColumn(column) == null ? null : cf.getColumn(column).value(); // may be null on row-level deletion
                            logger.debug(String.format("mutating indexed column %s value %s",
                                                       cf.getComparator().getString(column),
                                                       value == null ? "null" : ByteBufferUtil.bytesToHex(value)));
                        }
                    }
                }

                synchronized (indexLockFor(mutation.key()))
                {
                    ColumnFamily oldIndexedColumns = null;
                    if (mutatedIndexedColumns != null)
                    {
                        // with the raw data CF, we can just apply every update in any order and let
                        // read-time resolution throw out obsolete versions, thus avoiding read-before-write.
                        // but for indexed data we need to make sure that we're not creating index entries
                        // for obsolete writes.
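                        // (Illustrative scenario, not from the original comment: the index maps
                        // column value -> row key, so if an incoming write carries an older timestamp
                        // than the value already indexed, blindly indexing it would resurrect a stale
                        // entry; the read below fetches current state so such updates can be skipped.)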
                        oldIndexedColumns = readCurrentIndexedColumns(key, cfs, mutatedIndexedColumns);
                        logger.debug("Pre-mutation index row is {}", oldIndexedColumns);
                        ignoreObsoleteMutations(cf, mutatedIndexedColumns, oldIndexedColumns);
                    }

                    Memtable fullMemtable = cfs.apply(key, cf);
                    if (fullMemtable != null)
                        memtablesToFlush = addFullMemtable(memtablesToFlush, fullMemtable);

                    if (mutatedIndexedColumns != null)
                    {
                        // ignore full index memtables -- we flush those when the "master" one is full
                        applyIndexUpdates(mutation.key(), cf, cfs, mutatedIndexedColumns, oldIndexedColumns);
                    }
                }
            }
        }
        finally
        {
            switchLock.readLock().unlock();
        }

        // flush memtables that got filled up outside the readlock (maybeSwitchMemtable acquires writeLock).
        // usually mTF will be empty and this will be a no-op.
        for (Memtable memtable : memtablesToFlush)
            memtable.cfs.maybeSwitchMemtable(memtable, writeCommitLog);
    }

    private static List<Memtable> addFullMemtable(List<Memtable> memtablesToFlush, Memtable fullMemtable)
    {
        if (memtablesToFlush.isEmpty())
            memtablesToFlush = new ArrayList<Memtable>(2);
        memtablesToFlush.add(fullMemtable);
        return memtablesToFlush;
    }

    private static void ignoreObsoleteMutations(ColumnFamily cf, SortedSet<ByteBuffer> mutatedIndexedColumns, ColumnFamily oldIndexedColumns)
    {
        // DO NOT modify the cf object here, it can race w/ the CL write (see https://issues.apache.org/jira/browse/CASSANDRA-2604)

        if (oldIndexedColumns == null)
            return;

        for (Iterator<ByteBuffer> iter = mutatedIndexedColumns.iterator(); iter.hasNext(); )
        {
            ByteBuffer name = iter.next();
            IColumn newColumn = cf.getColumn(name); // null == row delete or it wouldn't be marked Mutated
            if (newColumn != null && cf.isMarkedForDelete())
            {
                // row is marked for delete, but column was also updated.  if column is timestamped less than
                // the row tombstone, treat it as if it didn't exist.  Otherwise we don't care about row
                // tombstone for the purpose of the index update and we can proceed as usual.
                if (newColumn.timestamp() <= cf.getMarkedForDeleteAt())
                {
                    // don't remove from the cf object; that can race w/ CommitLog write.  Leaving it is harmless.
                    newColumn = null;
                }
            }
            IColumn oldColumn = oldIndexedColumns.getColumn(name);

            // deletions are irrelevant to the index unless we're changing state from live -> deleted, i.e.,
            // just updating w/ a newer tombstone doesn't matter
            boolean bothDeleted = (newColumn == null || newColumn.isMarkedForDelete())
                                  && (oldColumn == null || oldColumn.isMarkedForDelete());
            // obsolete means either the row or the column timestamp we're applying is older than existing data
            boolean obsoleteRowTombstone = newColumn == null && oldColumn != null && cf.getMarkedForDeleteAt() < oldColumn.timestamp();
            boolean obsoleteColumn = newColumn != null && (newColumn.timestamp() <= oldIndexedColumns.getMarkedForDeleteAt()
                                                           || (oldColumn != null && oldColumn.reconcile(newColumn) == oldColumn));
            if (bothDeleted || obsoleteRowTombstone || obsoleteColumn)
            {
                if (logger.isDebugEnabled())
                    logger.debug("skipping index update for obsolete mutation of " + cf.getComparator().getString(name));
                iter.remove();
                oldIndexedColumns.remove(name);
            }
        }
    }

    private static ColumnFamily readCurrentIndexedColumns(DecoratedKey<?> key, ColumnFamilyStore cfs, SortedSet<ByteBuffer> mutatedIndexedColumns)
    {
        QueryFilter filter = QueryFilter.getNamesFilter(key, new QueryPath(cfs.getColumnFamilyName()), mutatedIndexedColumns);
        return cfs.getColumnFamily(filter);
    }
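    // Index maintenance in a nutshell (an illustration of the method below, not normative):
    // for each mutated indexed column we (1) add a new index entry mapping the column value
    // to the row key, and (2) tombstone the entry for the previous value, if any.  Both are
    // ordinary column writes into the index's own column family.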
    /**
     * Removes obsolete index entries and creates new ones for the given row key and mutated columns.
     *
     * @return list of full (index CF) memtables
     */
    private static List<Memtable> applyIndexUpdates(ByteBuffer key,
                                                    ColumnFamily cf,
                                                    ColumnFamilyStore cfs,
                                                    SortedSet<ByteBuffer> mutatedIndexedColumns,
                                                    ColumnFamily oldIndexedColumns)
    {
        List<Memtable> fullMemtables = Collections.emptyList();

        // add new index entries
        for (ByteBuffer columnName : mutatedIndexedColumns)
        {
            IColumn column = cf.getColumn(columnName);
            if (column == null || column.isMarkedForDelete())
                continue; // null column == row deletion

            DecoratedKey<LocalToken> valueKey = cfs.getIndexKeyFor(columnName, column.value());
            ColumnFamily cfi = cfs.newIndexedColumnFamily(columnName);
            if (column instanceof ExpiringColumn)
            {
                ExpiringColumn ec = (ExpiringColumn)column;
                cfi.addColumn(new ExpiringColumn(key, ByteBufferUtil.EMPTY_BYTE_BUFFER, ec.timestamp, ec.getTimeToLive(), ec.getLocalDeletionTime()));
            }
            else
            {
                cfi.addColumn(new Column(key, ByteBufferUtil.EMPTY_BYTE_BUFFER, column.timestamp()));
            }
            if (logger.isDebugEnabled())
                logger.debug("applying index row {}:{}", valueKey, cfi);
            Memtable fullMemtable = cfs.getIndexedColumnFamilyStore(columnName).apply(valueKey, cfi);
            if (fullMemtable != null)
                fullMemtables = addFullMemtable(fullMemtables, fullMemtable);
        }

        // remove the old index entries
        if (oldIndexedColumns != null)
        {
            int localDeletionTime = (int) (System.currentTimeMillis() / 1000);
            for (Map.Entry<ByteBuffer, IColumn> entry : oldIndexedColumns.getColumnsMap().entrySet())
            {
                ByteBuffer columnName = entry.getKey();
                IColumn column = entry.getValue();
                if (column.isMarkedForDelete())
                    continue;
                DecoratedKey<LocalToken> valueKey = cfs.getIndexKeyFor(columnName, column.value());
                ColumnFamily cfi = cfs.newIndexedColumnFamily(columnName);
                cfi.addTombstone(key, localDeletionTime, column.timestamp());
                Memtable fullMemtable = cfs.getIndexedColumnFamilyStore(columnName).apply(valueKey, cfi);
                if (logger.isDebugEnabled())
                    logger.debug("applying index tombstones {}:{}", valueKey, cfi);
                if (fullMemtable != null)
                    fullMemtables = addFullMemtable(fullMemtables, fullMemtable);
            }
        }
        return fullMemtables;
    }

    public static void cleanupIndexEntry(ColumnFamilyStore cfs, ByteBuffer key, IColumn column)
    {
        if (column.isMarkedForDelete())
            return;

        int localDeletionTime = (int) (System.currentTimeMillis() / 1000);
        DecoratedKey<LocalToken> valueKey = cfs.getIndexKeyFor(column.name(), column.value());
        ColumnFamily cfi = cfs.newIndexedColumnFamily(column.name());
        cfi.addTombstone(key, localDeletionTime, column.timestamp());
        Memtable fullMemtable = cfs.getIndexedColumnFamilyStore(column.name()).apply(valueKey, cfi);
        if (logger.isDebugEnabled())
            logger.debug("removed index entry for cleaned-up value {}:{}", valueKey, cfi);
        if (fullMemtable != null)
            fullMemtable.cfs.maybeSwitchMemtable(fullMemtable, false);
    }

    public IndexBuilder createIndexBuilder(ColumnFamilyStore cfs, SortedSet<ByteBuffer> columns, ReducingKeyIterator iter)
    {
        return new IndexBuilder(cfs, columns, iter);
    }

    public AbstractReplicationStrategy getReplicationStrategy()
    {
        return replicationStrategy;
    }

    public class IndexBuilder implements CompactionInfo.Holder
    {
        private final ColumnFamilyStore cfs;
        private final SortedSet<ByteBuffer> columns;
        private final ReducingKeyIterator iter;

        public IndexBuilder(ColumnFamilyStore cfs, SortedSet<ByteBuffer> columns, ReducingKeyIterator iter)
        {
            this.cfs = cfs;
            this.columns = columns;
            this.iter = iter;
        }

        public CompactionInfo getCompactionInfo()
        {
            return new CompactionInfo(cfs.table.name,
                                      cfs.columnFamily,
                                      CompactionType.INDEX_BUILD,
                                      iter.getBytesRead(),
                                      iter.getTotalBytes());
        }
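        /**
         * Iterates over every key provided by the ReducingKeyIterator and rebuilds
         * index entries for the requested columns, flushing any filled index
         * memtables along the way to avoid OOM during the build.
         */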
        public void build()
        {
            while (iter.hasNext())
            {
                DecoratedKey<?> key = iter.next();
                logger.debug("Indexing row {} ", key);
                List<Memtable> memtablesToFlush = Collections.emptyList();
                switchLock.readLock().lock();
                try
                {
                    synchronized (indexLockFor(key.key))
                    {
                        ColumnFamily cf = readCurrentIndexedColumns(key, cfs, columns);
                        if (cf != null)
                            memtablesToFlush = applyIndexUpdates(key.key, cf, cfs, cf.getColumnNames(), null);
                    }
                }
                finally
                {
                    switchLock.readLock().unlock();
                }

                // during index build, we do flush index memtables separately from master; otherwise we could OOM
                for (Memtable memtable : memtablesToFlush)
                    memtable.cfs.maybeSwitchMemtable(memtable, false);
            }

            try
            {
                iter.close();
            }
            catch (IOException e)
            {
                throw new RuntimeException(e);
            }
        }
    }

    private Object indexLockFor(ByteBuffer key)
    {
        return indexLocks[Math.abs(key.hashCode() % indexLocks.length)];
    }

    public List<Future<?>> flush() throws IOException
    {
        List<Future<?>> futures = new ArrayList<Future<?>>();
        for (Integer cfId : columnFamilyStores.keySet())
        {
            Future<?> future = columnFamilyStores.get(cfId).forceFlush();
            if (future != null)
                futures.add(future);
        }
        return futures;
    }

    // for binary load path.  skips commitlog.
    void load(RowMutation rowMutation) throws IOException
    {
        DecoratedKey<?> key = StorageService.getPartitioner().decorateKey(rowMutation.key());
        for (ColumnFamily columnFamily : rowMutation.getColumnFamilies())
        {
            Collection<IColumn> columns = columnFamily.getSortedColumns();
            for (IColumn column : columns)
            {
                ColumnFamilyStore cfStore = columnFamilyStores.get(ByteBufferUtil.toInt(column.name()));
                cfStore.applyBinary(key, column.value());
            }
        }
    }

    public String getDataFileLocation(long expectedSize)
    {
        String path = DatabaseDescriptor.getDataFileLocationForTable(name, expectedSize);
        if (path == null)
        {
            // retry after GCing to force unmap of compacted SSTables so they can be deleted
            StorageService.instance.requestGC();
            try
            {
                Thread.sleep(SSTableDeletingReference.RETRY_DELAY * 2);
            }
            catch (InterruptedException e)
            {
                throw new AssertionError(e);
            }
            path = DatabaseDescriptor.getDataFileLocationForTable(name, expectedSize);
        }
        return path;
    }

    public static String getSnapshotPath(String dataDirPath, String tableName, String snapshotName)
    {
        return dataDirPath + File.separator + tableName + File.separator + SNAPSHOT_SUBDIR_NAME + File.separator + snapshotName;
    }

    public static Iterable<Table> all()
    {
        Function<String, Table> transformer = new Function<String, Table>()
        {
            public Table apply(String tableName)
            {
                return Table.open(tableName);
            }
        };
        return Iterables.transform(DatabaseDescriptor.getTables(), transformer);
    }

    @Override
    public String toString()
    {
        return getClass().getSimpleName() + "(name='" + name + "')";
    }
}