/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.cassandra.db;

import java.io.File;
import java.io.IOException;
import java.util.*;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.Lock;

import com.google.common.base.Function;
import com.google.common.collect.Iterables;

import org.apache.cassandra.concurrent.Stage;
import org.apache.cassandra.concurrent.StageManager;
import org.apache.cassandra.config.*;
import org.apache.cassandra.db.commitlog.CommitLog;
import org.apache.cassandra.db.commitlog.ReplayPosition;
import org.apache.cassandra.db.compaction.CompactionManager;
import org.apache.cassandra.db.lifecycle.SSTableSet;
import org.apache.cassandra.db.partitions.PartitionUpdate;
import org.apache.cassandra.db.rows.UnfilteredRowIterator;
import org.apache.cassandra.db.view.ViewManager;
import org.apache.cassandra.exceptions.WriteTimeoutException;
import org.apache.cassandra.index.Index;
import org.apache.cassandra.index.SecondaryIndexManager;
import org.apache.cassandra.index.transactions.UpdateTransaction;
import org.apache.cassandra.io.sstable.format.SSTableReader;
import org.apache.cassandra.locator.AbstractReplicationStrategy;
import org.apache.cassandra.metrics.KeyspaceMetrics;
import org.apache.cassandra.schema.KeyspaceMetadata;
import org.apache.cassandra.service.StorageService;
import org.apache.cassandra.tracing.Tracing;
import org.apache.cassandra.utils.ByteBufferUtil;
import org.apache.cassandra.utils.FBUtilities;
import org.apache.cassandra.utils.JVMStabilityInspector;
import org.apache.cassandra.utils.concurrent.OpOrder;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Represents a keyspace: the container for a set of {@link ColumnFamilyStore}s
 * (one per table/view), the keyspace's replication strategy, its materialized
 * view manager, and its metrics. Instances are created lazily and cached via
 * {@link #open(String)}; writes enter through the {@link #apply} family of methods.
 */
public class Keyspace
{
    private static final Logger logger = LoggerFactory.getLogger(Keyspace.class);

    // Test hook: when the system property names this keyspace, every write throws.
    private static final String TEST_FAIL_WRITES_KS = System.getProperty("cassandra.test.fail_writes_ks", "");
    private static final boolean TEST_FAIL_WRITES = !TEST_FAIL_WRITES_KS.isEmpty();

    public final KeyspaceMetrics metric;

    // It is possible to call Keyspace.open without a running daemon, so it makes sense to ensure
    // proper directories here as well as in CassandraDaemon.
    static
    {
        if (!Config.isClientMode())
            DatabaseDescriptor.createAllDirectories();
    }

    private volatile KeyspaceMetadata metadata;

    //OpOrder is defined globally since we need to order writes across
    //Keyspaces in the case of Views (batchlog of view mutations)
    public static final OpOrder writeOrder = new OpOrder();

    /* ColumnFamilyStore per column family */
    private final ConcurrentMap<UUID, ColumnFamilyStore> columnFamilyStores = new ConcurrentHashMap<>();
    private volatile AbstractReplicationStrategy replicationStrategy;
    public final ViewManager viewManager;

    // Maps a keyspace name to its (lazily opened) Keyspace instance; used by the all()/nonSystem()/... iterables.
    public static final Function<String, Keyspace> keyspaceTransformer = new Function<String, Keyspace>()
    {
        public Keyspace apply(String keyspaceName)
        {
            return Keyspace.open(keyspaceName);
        }
    };

    // Guards against opening non-system keyspaces before startup has completed; see open(String).
    private static volatile boolean initialized = false;

    public static void setInitialized()
    {
        initialized = true;
    }

    /**
     * Returns the (cached) Keyspace instance for the given name, opening it with SSTables loaded
     * if necessary. Before {@link #setInitialized()} only system keyspaces may be opened.
     */
    public static Keyspace open(String keyspaceName)
    {
        assert initialized || Schema.isSystemKeyspace(keyspaceName);
        return open(keyspaceName, Schema.instance, true);
    }

    // to only be used by org.apache.cassandra.tools.Standalone* classes
    public static Keyspace openWithoutSSTables(String keyspaceName)
    {
        return open(keyspaceName, Schema.instance, false);
    }

    private static Keyspace open(String keyspaceName, Schema schema, boolean loadSSTables)
    {
        Keyspace keyspaceInstance = schema.getKeyspaceInstance(keyspaceName);

        if (keyspaceInstance == null)
        {
            // instantiate the Keyspace.  we could use putIfAbsent but it's important to making sure it is only done once
            // per keyspace, so we synchronize and re-check before doing it.
            synchronized (Keyspace.class)
            {
                keyspaceInstance = schema.getKeyspaceInstance(keyspaceName);
                if (keyspaceInstance == null)
                {
                    // open and store the keyspace
                    keyspaceInstance = new Keyspace(keyspaceName, loadSSTables);
                    schema.storeKeyspaceInstance(keyspaceInstance);
                }
            }
        }
        return keyspaceInstance;
    }

    /**
     * Removes the Keyspace instance from the schema cache, unloading (flush + invalidate) each of
     * its column family stores and releasing its metrics.
     *
     * @return the removed instance, or null if it was not open
     */
    public static Keyspace clear(String keyspaceName)
    {
        return clear(keyspaceName, Schema.instance);
    }

    public static Keyspace clear(String keyspaceName, Schema schema)
    {
        synchronized (Keyspace.class)
        {
            Keyspace t = schema.removeKeyspaceInstance(keyspaceName);
            if (t != null)
            {
                for (ColumnFamilyStore cfs : t.getColumnFamilyStores())
                    t.unloadCf(cfs);
                t.metric.release();
            }
            return t;
        }
    }

    /** Convenience: opens the keyspace owning the given table metadata and returns that table's store. */
    public static ColumnFamilyStore openAndGetStore(CFMetaData cfm)
    {
        return open(cfm.ksName).getColumnFamilyStore(cfm.cfId);
    }

    /**
     * Removes every SSTable in the directory from the appropriate Tracker's view.
     * @param directory the unreadable directory, possibly with SSTables in it, but not necessarily.
     */
    public static void removeUnreadableSSTables(File directory)
    {
        for (Keyspace keyspace : Keyspace.all())
        {
            for (ColumnFamilyStore baseCfs : keyspace.getColumnFamilyStores())
            {
                for (ColumnFamilyStore cfs : baseCfs.concatWithIndexes())
                    cfs.maybeRemoveUnreadableSSTables(directory);
            }
        }
    }

    /** Replaces this keyspace's metadata and rebuilds the replication strategy from it. */
    public void setMetadata(KeyspaceMetadata metadata)
    {
        this.metadata = metadata;
        createReplicationStrategy(metadata);
    }

    public KeyspaceMetadata getMetadata()
    {
        return metadata;
    }

    public Collection<ColumnFamilyStore> getColumnFamilyStores()
    {
        return Collections.unmodifiableCollection(columnFamilyStores.values());
    }

    /** @throws IllegalArgumentException if no table with that name exists in this keyspace */
    public ColumnFamilyStore getColumnFamilyStore(String cfName)
    {
        UUID id = Schema.instance.getId(getName(), cfName);
        if (id == null)
            throw new IllegalArgumentException(String.format("Unknown keyspace/cf pair (%s.%s)", getName(), cfName));
        return getColumnFamilyStore(id);
    }

    /** @throws IllegalArgumentException if no table with that id is open in this keyspace */
    public ColumnFamilyStore getColumnFamilyStore(UUID id)
    {
        ColumnFamilyStore cfs = columnFamilyStores.get(id);
        if (cfs == null)
            throw new IllegalArgumentException("Unknown CF " + id);
        return cfs;
    }

    public boolean hasColumnFamilyStore(UUID id)
    {
        return columnFamilyStores.containsKey(id);
    }

    /**
     * Take a snapshot of the specific column family, or the entire set of column families
     * if columnFamily is null with a given timestamp
     *
     * @param snapshotName     the tag associated with the name of the snapshot.  This value may not be null
     * @param columnFamilyName the column family to snapshot or all on null
     * @throws IOException if the column family doesn't exist
     */
    public void snapshot(String snapshotName, String columnFamilyName) throws IOException
    {
        assert snapshotName != null;
        boolean tookSnapShot = false;
        for (ColumnFamilyStore cfStore : columnFamilyStores.values())
        {
            if (columnFamilyName == null || cfStore.name.equals(columnFamilyName))
            {
                tookSnapShot = true;
                cfStore.snapshot(snapshotName);
            }
        }

        if ((columnFamilyName != null) && !tookSnapShot)
            throw new IOException("Failed taking snapshot. Table " + columnFamilyName + " does not exist.");
    }

    /**
     * @param clientSuppliedName may be null.
     * @return the name of the snapshot
     */
    public static String getTimestampedSnapshotName(String clientSuppliedName)
    {
        String snapshotName = Long.toString(System.currentTimeMillis());
        if (clientSuppliedName != null && !clientSuppliedName.equals(""))
        {
            snapshotName = snapshotName + "-" + clientSuppliedName;
        }
        return snapshotName;
    }

    /**
     * Check whether snapshots already exists for a given name.
     *
     * @param snapshotName the user supplied snapshot name
     * @return true if the snapshot exists
     */
    public boolean snapshotExists(String snapshotName)
    {
        assert snapshotName != null;
        for (ColumnFamilyStore cfStore : columnFamilyStores.values())
        {
            if (cfStore.snapshotExists(snapshotName))
                return true;
        }
        return false;
    }

    /**
     * Clear all the snapshots for a given keyspace.
     *
     * @param snapshotName the user supplied snapshot name. If empty or null,
     *                     all the snapshots will be cleaned
     */
    public static void clearSnapshot(String snapshotName, String keyspace)
    {
        List<File> snapshotDirs = Directories.getKSChildDirectories(keyspace, ColumnFamilyStore.getInitialDirectories());
        Directories.clearSnapshot(snapshotName, snapshotDirs);
    }

    /**
     * @return A list of open SSTableReaders
     */
    public List<SSTableReader> getAllSSTables(SSTableSet sstableSet)
    {
        List<SSTableReader> list = new ArrayList<>(columnFamilyStores.size());
        for (ColumnFamilyStore cfStore : columnFamilyStores.values())
            Iterables.addAll(list, cfStore.getSSTables(sstableSet));
        return list;
    }

    private Keyspace(String keyspaceName, boolean loadSSTables)
    {
        metadata = Schema.instance.getKSMetaData(keyspaceName);
        assert metadata != null : "Unknown keyspace " + keyspaceName;
        createReplicationStrategy(metadata);

        this.metric = new KeyspaceMetrics(this);
        this.viewManager = new ViewManager(this);
        for (CFMetaData cfm : metadata.tablesAndViews())
        {
            logger.trace("Initializing {}.{}", getName(), cfm.cfName);
            initCf(cfm, loadSSTables);
        }
        this.viewManager.reload();
    }

    // Minimal constructor for mockKS: no table stores are initialized.
    private Keyspace(KeyspaceMetadata metadata)
    {
        this.metadata = metadata;
        createReplicationStrategy(metadata);
        this.metric = new KeyspaceMetrics(this);
        this.viewManager = new ViewManager(this);
    }

    public static Keyspace mockKS(KeyspaceMetadata metadata)
    {
        return new Keyspace(metadata);
    }

    private void createReplicationStrategy(KeyspaceMetadata ksm)
    {
        replicationStrategy = AbstractReplicationStrategy.createReplicationStrategy(ksm.name,
                                                                                   ksm.params.replication.klass,
                                                                                   StorageService.instance.getTokenMetadata(),
                                                                                   DatabaseDescriptor.getEndpointSnitch(),
                                                                                   ksm.params.replication.options);
    }

    // best invoked on the compaction manager.
    public void dropCf(UUID cfId)
    {
        assert columnFamilyStores.containsKey(cfId);
        ColumnFamilyStore cfs = columnFamilyStores.remove(cfId);
        if (cfs == null)
            return;

        cfs.getCompactionStrategyManager().shutdown();
        CompactionManager.instance.interruptCompactionForCFs(cfs.concatWithIndexes(), true);
        // wait for any outstanding reads/writes that might affect the CFS
        cfs.keyspace.writeOrder.awaitNewBarrier();
        cfs.readOrdering.awaitNewBarrier();

        unloadCf(cfs);
    }

    // disassociate a cfs from this keyspace instance.
    private void unloadCf(ColumnFamilyStore cfs)
    {
        cfs.forceBlockingFlush();
        cfs.invalidate();
    }

    /**
     * adds a cf to internal structures, ends up creating disk files).
     */
    public void initCf(CFMetaData metadata, boolean loadSSTables)
    {
        ColumnFamilyStore cfs = columnFamilyStores.get(metadata.cfId);

        if (cfs == null)
        {
            // CFS being created for the first time, either on server startup or new CF being added.
            // We don't worry about races here; startup is safe, and adding multiple identical CFs
            // simultaneously is a "don't do that" scenario.
            ColumnFamilyStore oldCfs = columnFamilyStores.putIfAbsent(metadata.cfId, ColumnFamilyStore.createColumnFamilyStore(this, metadata, loadSSTables));
            // CFS mbean instantiation will error out before we hit this, but in case that changes...
            if (oldCfs != null)
                throw new IllegalStateException("added multiple mappings for cf id " + metadata.cfId);
        }
        else
        {
            // re-initializing an existing CF.  This will happen if you cleared the schema
            // on this node and it's getting repopulated from the rest of the cluster.
            assert cfs.name.equals(metadata.cfName);
            cfs.reload();
        }
    }

    public CompletableFuture<?> apply(Mutation mutation, boolean writeCommitLog)
    {
        return apply(mutation, writeCommitLog, true, false, null);
    }

    public CompletableFuture<?> apply(Mutation mutation, boolean writeCommitLog, boolean updateIndexes)
    {
        return apply(mutation, writeCommitLog, updateIndexes, false, null);
    }

    public CompletableFuture<?> applyFromCommitLog(Mutation mutation)
    {
        return apply(mutation, false, true, true, null);
    }

    /**
     * This method appends a row to the global CommitLog, then updates memtables and indexes.
     *
     * @param mutation       the row to write.  Must not be modified after calling apply, since commitlog append
     *                       may happen concurrently, depending on the CL Executor type.
     * @param writeCommitLog false to disable commitlog append entirely
     * @param updateIndexes  false to disable index updates (used by CollationController "defragmenting")
     * @param isClReplay     true if caller is the commitlog replayer
     * @param future         an existing future to complete (may be null); used when a mutation needing a
     *                       materialized-view lock is requeued onto the MUTATION stage
     * @return a future completed once the mutation has been applied (or completed exceptionally with
     *         a WriteTimeoutException if the MV lock could not be acquired in time)
     */
    public CompletableFuture<?> apply(final Mutation mutation,
                                      final boolean writeCommitLog,
                                      boolean updateIndexes,
                                      boolean isClReplay,
                                      CompletableFuture<?> future)
    {
        if (TEST_FAIL_WRITES && metadata.name.equals(TEST_FAIL_WRITES_KS))
            throw new RuntimeException("Testing write failures");

        Lock lock = null;
        boolean requiresViewUpdate = updateIndexes && viewManager.updatesAffectView(Collections.singleton(mutation), false);

        final CompletableFuture<?> mark = future == null ? new CompletableFuture<>() : future;

        if (requiresViewUpdate)
        {
            // Record the first attempt time so total lock-wait time survives requeues.
            mutation.viewLockAcquireStart.compareAndSet(0L, System.currentTimeMillis());
            lock = ViewManager.acquireLockFor(mutation.key().getKey());

            if (lock == null)
            {
                // avoid throwing a WTE during commitlog replay
                if (!isClReplay && (System.currentTimeMillis() - mutation.createdAt) > DatabaseDescriptor.getWriteRpcTimeout())
                {
                    logger.trace("Could not acquire lock for {}", ByteBufferUtil.bytesToHex(mutation.key().getKey()));
                    Tracing.trace("Could not acquire MV lock");
                    if (future != null)
                        future.completeExceptionally(new WriteTimeoutException(WriteType.VIEW, ConsistencyLevel.LOCAL_ONE, 0, 1));
                    else
                        throw new WriteTimeoutException(WriteType.VIEW, ConsistencyLevel.LOCAL_ONE, 0, 1);
                }
                else
                {
                    //This view update can't happen right now. so rather than keep this thread busy
                    // we will re-apply ourself to the queue and try again later
                    StageManager.getStage(Stage.MUTATION).execute(() -> apply(mutation, writeCommitLog, true, isClReplay, mark));
                    return mark;
                }
            }
            else
            {
                long acquireTime = System.currentTimeMillis() - mutation.viewLockAcquireStart.get();
                if (!isClReplay)
                {
                    for (UUID cfid : mutation.getColumnFamilyIds())
                    {
                        columnFamilyStores.get(cfid).metric.viewLockAcquireTime.update(acquireTime, TimeUnit.MILLISECONDS);
                    }
                }
            }
        }
        int nowInSec = FBUtilities.nowInSeconds();
        try (OpOrder.Group opGroup = writeOrder.start())
        {
            // write the mutation to the commitlog and memtables
            ReplayPosition replayPosition = null;
            if (writeCommitLog)
            {
                Tracing.trace("Appending to commitlog");
                replayPosition = CommitLog.instance.add(mutation);
            }

            for (PartitionUpdate upd : mutation.getPartitionUpdates())
            {
                ColumnFamilyStore cfs = columnFamilyStores.get(upd.metadata().cfId);
                if (cfs == null)
                {
                    logger.error("Attempting to mutate non-existant table {} ({}.{})", upd.metadata().cfId, upd.metadata().ksName, upd.metadata().cfName);
                    continue;
                }
                AtomicLong baseComplete = new AtomicLong(Long.MAX_VALUE);

                if (requiresViewUpdate)
                {
                    try
                    {
                        Tracing.trace("Creating materialized view mutations from base table replica");
                        viewManager.forTable(upd.metadata()).pushViewReplicaUpdates(upd, !isClReplay, baseComplete);
                    }
                    catch (Throwable t)
                    {
                        JVMStabilityInspector.inspectThrowable(t);
                        logger.error(String.format("Unknown exception caught while attempting to update MaterializedView! %s.%s", upd.metadata().ksName, upd.metadata().cfName), t);
                        throw t;
                    }
                }

                Tracing.trace("Adding to {} memtable", upd.metadata().cfName);
                UpdateTransaction indexTransaction = updateIndexes
                                                     ? cfs.indexManager.newUpdateTransaction(upd, opGroup, nowInSec)
                                                     : UpdateTransaction.NO_OP;
                cfs.apply(upd, indexTransaction, opGroup, replayPosition);
                if (requiresViewUpdate)
                    baseComplete.set(System.currentTimeMillis());
            }

            mark.complete(null);
            return mark;
        }
        finally
        {
            if (lock != null)
                lock.unlock();
        }
    }

    public AbstractReplicationStrategy getReplicationStrategy()
    {
        return replicationStrategy;
    }

    /**
     * @param key     row to index
     * @param cfs     ColumnFamily to index partition in
     * @param indexes the indexes to submit the row to
     */
    public static void indexPartition(DecoratedKey key, ColumnFamilyStore cfs, Set<Index> indexes)
    {
        if (logger.isTraceEnabled())
            logger.trace("Indexing partition {} ", cfs.metadata.getKeyValidator().getString(key.getKey()));

        SinglePartitionReadCommand cmd = SinglePartitionReadCommand.fullPartitionRead(cfs.metadata, FBUtilities.nowInSeconds(), key);

        try (OpOrder.Group writeGroup = cfs.keyspace.writeOrder.start();
             OpOrder.Group readGroup = cfs.readOrdering.start();
             UnfilteredRowIterator partition = cmd.queryMemtableAndDisk(cfs, readGroup))
        {
            cfs.indexManager.indexPartition(partition, writeGroup, indexes, cmd.nowInSec());
        }
    }

    /** Triggers a (non-blocking) flush of every column family store in this keyspace. */
    public List<Future<?>> flush()
    {
        List<Future<?>> futures = new ArrayList<>(columnFamilyStores.size());
        for (ColumnFamilyStore cfs : columnFamilyStores.values())
            futures.add(cfs.forceFlush());
        return futures;
    }

    /**
     * Resolves table names (or, with no names given, all tables) to their stores, optionally
     * expanding each to include its secondary-index backing tables.
     *
     * @param allowIndexes   whether explicitly named secondary-index tables are permitted
     * @param autoAddIndexes whether to also include each store's index backing tables
     * @param cfNames        the tables to resolve; empty means every store in the keyspace
     * @throws IllegalArgumentException for an unknown table or index name
     */
    public Iterable<ColumnFamilyStore> getValidColumnFamilies(boolean allowIndexes,
                                                              boolean autoAddIndexes,
                                                              String... cfNames) throws IOException
    {
        Set<ColumnFamilyStore> valid = new HashSet<>();

        if (cfNames.length == 0)
        {
            // all stores are interesting
            for (ColumnFamilyStore cfStore : getColumnFamilyStores())
            {
                valid.add(cfStore);
                if (autoAddIndexes)
                    valid.addAll(getIndexColumnFamilyStores(cfStore));
            }
            return valid;
        }

        // include the specified stores and possibly the stores of any of their indexes
        for (String cfName : cfNames)
        {
            if (SecondaryIndexManager.isIndexColumnFamily(cfName))
            {
                if (!allowIndexes)
                {
                    logger.warn("Operation not allowed on secondary Index table ({})", cfName);
                    continue;
                }
                String baseName = SecondaryIndexManager.getParentCfsName(cfName);
                String indexName = SecondaryIndexManager.getIndexName(cfName);

                ColumnFamilyStore baseCfs = getColumnFamilyStore(baseName);
                Index index = baseCfs.indexManager.getIndexByName(indexName);
                if (index == null)
                    throw new IllegalArgumentException(String.format("Invalid index specified: %s/%s.",
                                                                     baseCfs.metadata.cfName,
                                                                     indexName));

                if (index.getBackingTable().isPresent())
                    valid.add(index.getBackingTable().get());
            }
            else
            {
                ColumnFamilyStore cfStore = getColumnFamilyStore(cfName);
                valid.add(cfStore);
                if (autoAddIndexes)
                    valid.addAll(getIndexColumnFamilyStores(cfStore));
            }
        }

        return valid;
    }

    private Set<ColumnFamilyStore> getIndexColumnFamilyStores(ColumnFamilyStore baseCfs)
    {
        Set<ColumnFamilyStore> stores = new HashSet<>();
        for (ColumnFamilyStore indexCfs : baseCfs.indexManager.getAllIndexColumnFamilyStores())
        {
            logger.info("adding secondary index table {} to operation", indexCfs.metadata.cfName);
            stores.add(indexCfs);
        }
        return stores;
    }

    public static Iterable<Keyspace> all()
    {
        return Iterables.transform(Schema.instance.getKeyspaces(), keyspaceTransformer);
    }

    public static Iterable<Keyspace> nonSystem()
    {
        return Iterables.transform(Schema.instance.getNonSystemKeyspaces(), keyspaceTransformer);
    }

    public static Iterable<Keyspace> nonLocalStrategy()
    {
        return Iterables.transform(Schema.instance.getNonLocalStrategyKeyspaces(), keyspaceTransformer);
    }

    public static Iterable<Keyspace> system()
    {
        return Iterables.transform(Schema.SYSTEM_KEYSPACE_NAMES, keyspaceTransformer);
    }

    @Override
    public String toString()
    {
        return getClass().getSimpleName() + "(name='" + getName() + "')";
    }

    public String getName()
    {
        return metadata.name;
    }
}