/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.cassandra.db.view; import java.util.*; import java.util.concurrent.CopyOnWriteArrayList; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicLong; import com.google.common.collect.Iterables; import com.google.common.collect.Iterators; import com.google.common.collect.PeekingIterator; import org.apache.cassandra.config.CFMetaData; import org.apache.cassandra.db.*; import org.apache.cassandra.db.commitlog.ReplayPosition; import org.apache.cassandra.db.filter.*; import org.apache.cassandra.db.rows.*; import org.apache.cassandra.db.partitions.*; import org.apache.cassandra.dht.Token; import org.apache.cassandra.service.StorageProxy; import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.btree.BTreeSet; /** * Groups all the views for a given table. */ public class TableViews extends AbstractCollection<View> { private final CFMetaData baseTableMetadata; // We need this to be thread-safe, but the number of times this is changed (when a view is created in the keyspace) // massively exceeds the number of time it's read (for every mutation on the keyspace), so a copy-on-write list is the best option. private final List<View> views = new CopyOnWriteArrayList(); public TableViews(CFMetaData baseTableMetadata) { this.baseTableMetadata = baseTableMetadata; } public int size() { return views.size(); } public Iterator<View> iterator() { return views.iterator(); } public boolean contains(String viewName) { return Iterables.any(views, view -> view.name.equals(viewName)); } public boolean add(View view) { // We should have validated that there is no existing view with this name at this point assert !contains(view.name); return views.add(view); } public Iterable<ColumnFamilyStore> allViewsCfs() { Keyspace keyspace = Keyspace.open(baseTableMetadata.ksName); return Iterables.transform(views, view -> keyspace.getColumnFamilyStore(view.getDefinition().viewName)); } public void forceBlockingFlush() { for (ColumnFamilyStore viewCfs : allViewsCfs()) viewCfs.forceBlockingFlush(); } public void dumpMemtables() { for (ColumnFamilyStore viewCfs : allViewsCfs()) viewCfs.dumpMemtable(); } public void truncateBlocking(ReplayPosition replayAfter, long truncatedAt) { for (ColumnFamilyStore viewCfs : allViewsCfs()) { viewCfs.discardSSTables(truncatedAt); SystemKeyspace.saveTruncationRecord(viewCfs, truncatedAt, replayAfter); } } public void removeByName(String viewName) { views.removeIf(v -> v.name.equals(viewName)); } /** * Calculates and pushes updates to the views replicas. The replicas are determined by * {@link ViewUtils#getViewNaturalEndpoint(String, Token, Token)}. * * @param update an update on the base table represented by this object. * @param writeCommitLog whether we should write the commit log for the view updates. * @param baseComplete time from epoch in ms that the local base mutation was (or will be) completed */ public void pushViewReplicaUpdates(PartitionUpdate update, boolean writeCommitLog, AtomicLong baseComplete) { assert update.metadata().cfId.equals(baseTableMetadata.cfId); Collection<View> views = updatedViews(update); if (views.isEmpty()) return; // Read modified rows int nowInSec = FBUtilities.nowInSeconds(); SinglePartitionReadCommand command = readExistingRowsCommand(update, views, nowInSec); if (command == null) return; ColumnFamilyStore cfs = Keyspace.openAndGetStore(update.metadata()); long start = System.nanoTime(); Collection<Mutation> mutations; try (ReadOrderGroup orderGroup = command.startOrderGroup(); UnfilteredRowIterator existings = UnfilteredPartitionIterators.getOnlyElement(command.executeLocally(orderGroup), command); UnfilteredRowIterator updates = update.unfilteredIterator()) { mutations = generateViewUpdates(views, updates, existings, nowInSec); } Keyspace.openAndGetStore(update.metadata()).metric.viewReadTime.update(System.nanoTime() - start, TimeUnit.NANOSECONDS); if (!mutations.isEmpty()) StorageProxy.mutateMV(update.partitionKey().getKey(), mutations, writeCommitLog, baseComplete); } /** * Given some updates on the base table of this object and the existing values for the rows affected by that update, generates the * mutation to be applied to the provided views. * * @param views the views potentially affected by {@code updates}. * @param updates the base table updates being applied. * @param existings the existing values for the rows affected by {@code updates}. This is used to decide if a view is * obsoleted by the update and should be removed, gather the values for columns that may not be part of the update if * a new view entry needs to be created, and compute the minimal updates to be applied if the view entry isn't changed * but has simply some updated values. This will be empty for view building as we want to assume anything we'll pass * to {@code updates} is new. * @param nowInSec the current time in seconds. * @return the mutations to apply to the {@code views}. This can be empty. */ public Collection<Mutation> generateViewUpdates(Collection<View> views, UnfilteredRowIterator updates, UnfilteredRowIterator existings, int nowInSec) { assert updates.metadata().cfId.equals(baseTableMetadata.cfId); List<ViewUpdateGenerator> generators = new ArrayList<>(views.size()); for (View view : views) generators.add(new ViewUpdateGenerator(view, updates.partitionKey(), nowInSec)); DeletionTracker existingsDeletion = new DeletionTracker(existings.partitionLevelDeletion()); DeletionTracker updatesDeletion = new DeletionTracker(updates.partitionLevelDeletion()); /* * We iterate through the updates and the existing rows in parallel. This allows us to know the consequence * on the view of each update. */ PeekingIterator<Unfiltered> existingsIter = Iterators.peekingIterator(existings); PeekingIterator<Unfiltered> updatesIter = Iterators.peekingIterator(updates); while (existingsIter.hasNext() && updatesIter.hasNext()) { Unfiltered existing = existingsIter.peek(); Unfiltered update = updatesIter.peek(); Row existingRow; Row updateRow; int cmp = baseTableMetadata.comparator.compare(update, existing); if (cmp < 0) { // We have an update where there was nothing before if (update.isRangeTombstoneMarker()) { updatesDeletion.update(updatesIter.next()); continue; } updateRow = ((Row)updatesIter.next()).withRowDeletion(updatesDeletion.currentDeletion()); existingRow = emptyRow(updateRow.clustering(), existingsDeletion.currentDeletion()); } else if (cmp > 0) { // We have something existing but no update (which will happen either because it's a range tombstone marker in // existing, or because we've fetched the existing row due to some partition/range deletion in the updates) if (existing.isRangeTombstoneMarker()) { existingsDeletion.update(existingsIter.next()); continue; } existingRow = ((Row)existingsIter.next()).withRowDeletion(existingsDeletion.currentDeletion()); updateRow = emptyRow(existingRow.clustering(), updatesDeletion.currentDeletion()); // The way we build the read command used for existing rows, we should always have updatesDeletion.currentDeletion() // that is not live, since we wouldn't have read the existing row otherwise. And we could assert that, but if we ever // change the read method so that it can slightly over-read in some case, that would be an easily avoiding bug lurking, // so we just handle the case. if (updateRow == null) continue; } else { // We're updating a row that had pre-existing data if (update.isRangeTombstoneMarker()) { assert existing.isRangeTombstoneMarker(); updatesDeletion.update(updatesIter.next()); existingsDeletion.update(existingsIter.next()); continue; } assert !existing.isRangeTombstoneMarker(); existingRow = ((Row)existingsIter.next()).withRowDeletion(existingsDeletion.currentDeletion()); updateRow = ((Row)updatesIter.next()).withRowDeletion(updatesDeletion.currentDeletion()); } addToViewUpdateGenerators(existingRow, updateRow, generators, nowInSec); } // We only care about more existing rows if the update deletion isn't live, i.e. if we had a partition deletion if (!updatesDeletion.currentDeletion().isLive()) { while (existingsIter.hasNext()) { Unfiltered existing = existingsIter.next(); // If it's a range tombstone, we don't care, we're only looking for existing entry that gets deleted by // the new partition deletion if (existing.isRangeTombstoneMarker()) continue; Row existingRow = (Row)existing; addToViewUpdateGenerators(existingRow, emptyRow(existingRow.clustering(), updatesDeletion.currentDeletion()), generators, nowInSec); } } while (updatesIter.hasNext()) { Unfiltered update = updatesIter.next(); // If it's a range tombstone, it removes nothing pre-exisiting, so we can ignore it for view updates if (update.isRangeTombstoneMarker()) continue; Row updateRow = (Row)update; addToViewUpdateGenerators(emptyRow(updateRow.clustering(), DeletionTime.LIVE), updateRow, generators, nowInSec); } return buildMutations(baseTableMetadata, generators); } /** * Return the views that are potentially updated by the provided updates. * * @param updates the updates applied to the base table. * @return the views affected by {@code updates}. */ public Collection<View> updatedViews(PartitionUpdate updates) { List<View> matchingViews = new ArrayList<>(views.size()); for (View view : views) { ReadQuery selectQuery = view.getReadQuery(); if (!selectQuery.selectsKey(updates.partitionKey())) continue; matchingViews.add(view); } return matchingViews; } /** * Returns the command to use to read the existing rows required to generate view updates for the provided base * base updates. * * @param updates the base table updates being applied. * @param views the views potentially affected by {@code updates}. * @param nowInSec the current time in seconds. * @return the command to use to read the base table rows required to generate view updates for {@code updates}. */ private SinglePartitionReadCommand readExistingRowsCommand(PartitionUpdate updates, Collection<View> views, int nowInSec) { Slices.Builder sliceBuilder = null; DeletionInfo deletionInfo = updates.deletionInfo(); CFMetaData metadata = updates.metadata(); DecoratedKey key = updates.partitionKey(); // TODO: This is subtle: we need to gather all the slices that we have to fetch between partition del, range tombstones and rows. if (!deletionInfo.isLive()) { sliceBuilder = new Slices.Builder(metadata.comparator); // Everything covered by a deletion might invalidate an existing view entry, which means we must read it to know. In practice // though, the views involved might filter some base table clustering columns, in which case we can restrict what we read // using those restrictions. // If there is a partition deletion, then we can simply take each slices from each view select filter. They may overlap but // the Slices.Builder handles that for us. Note that in many case this will just involve reading everything (as soon as any // view involved has no clustering restrictions for instance). // For range tombstone, we should theoretically take the difference between the range tombstoned and the slices selected // by every views, but as we don't an easy way to compute that right now, we keep it simple and just use the tombstoned // range. // TODO: we should improve that latter part. if (!deletionInfo.getPartitionDeletion().isLive()) { for (View view : views) sliceBuilder.addAll(view.getSelectStatement().clusteringIndexFilterAsSlices()); } else { assert deletionInfo.hasRanges(); Iterator<RangeTombstone> iter = deletionInfo.rangeIterator(false); while (iter.hasNext()) sliceBuilder.add(iter.next().deletedSlice()); } } // We need to read every row that is updated, unless we can prove that it has no impact on any view entries. // If we had some slices from the deletions above, we'll continue using that. Otherwise, it's more efficient to build // a names query. BTreeSet.Builder<Clustering> namesBuilder = sliceBuilder == null ? BTreeSet.builder(metadata.comparator) : null; for (Row row : updates) { // Don't read the existing state if we can prove the update won't affect any views if (!affectsAnyViews(key, row, views)) continue; if (namesBuilder == null) sliceBuilder.add(Slice.make(row.clustering())); else namesBuilder.add(row.clustering()); } NavigableSet<Clustering> names = namesBuilder == null ? null : namesBuilder.build(); // If we have a slice builder, it means we had some deletions and we have to read. But if we had // only row updates, it's possible none of them affected the views, in which case we have nothing // to do. if (names != null && names.isEmpty()) return null; ClusteringIndexFilter clusteringFilter = names == null ? new ClusteringIndexSliceFilter(sliceBuilder.build(), false) : new ClusteringIndexNamesFilter(names, false); // If we have more than one view, we should merge the queried columns by each views but to keep it simple we just // include everything. We could change that in the future. ColumnFilter queriedColumns = views.size() == 1 ? Iterables.getOnlyElement(views).getSelectStatement().queriedColumns() : ColumnFilter.all(metadata); // Note that the views could have restrictions on regular columns, but even if that's the case we shouldn't apply those // when we read, because even if an existing row doesn't match the view filter, the update can change that in which // case we'll need to know the existing content. There is also no easy way to merge those RowFilter when we have multiple views. // TODO: we could still make sense to special case for when there is a single view and a small number of updates (and // no deletions). Indeed, in that case we could check whether any of the update modify any of the restricted regular // column, and if that's not the case we could use view filter. We keep it simple for now though. RowFilter rowFilter = RowFilter.NONE; return SinglePartitionReadCommand.create(metadata, nowInSec, queriedColumns, rowFilter, DataLimits.NONE, key, clusteringFilter); } private boolean affectsAnyViews(DecoratedKey partitionKey, Row update, Collection<View> views) { for (View view : views) { if (view.mayBeAffectedBy(partitionKey, update)) return true; } return false; } /** * Given an existing base row and the update that we're going to apply to this row, generate the modifications * to apply to MVs using the provided {@code ViewUpdateGenerator}s. * * @param existingBaseRow the base table row as it is before an update. * @param updateBaseRow the newly updates made to {@code existingBaseRow}. * @param generators the view update generators to add the new changes to. * @param nowInSec the current time in seconds. Used to decide if data is live or not. */ private static void addToViewUpdateGenerators(Row existingBaseRow, Row updateBaseRow, Collection<ViewUpdateGenerator> generators, int nowInSec) { // Having existing empty is useful, it just means we'll insert a brand new entry for updateBaseRow, // but if we have no update at all, we shouldn't get there. assert !updateBaseRow.isEmpty(); // We allow existingBaseRow to be null, which we treat the same as being empty as an small optimization // to avoid allocating empty row objects when we know there was nothing existing. Row mergedBaseRow = existingBaseRow == null ? updateBaseRow : Rows.merge(existingBaseRow, updateBaseRow, nowInSec); for (ViewUpdateGenerator generator : generators) generator.addBaseTableUpdate(existingBaseRow, mergedBaseRow); } private static Row emptyRow(Clustering clustering, DeletionTime deletion) { // Returning null for an empty row is slightly ugly, but the case where there is no pre-existing row is fairly common // (especially when building the view), so we want to avoid a dummy allocation of an empty row every time. // And MultiViewUpdateBuilder knows how to deal with that. return deletion.isLive() ? null : BTreeRow.emptyDeletedRow(clustering, Row.Deletion.regular(deletion)); } /** * Extracts (and potentially groups) the mutations generated by the provided view update generator. * Returns the mutation that needs to be done to the views given the base table updates * passed to {@link #addBaseTableUpdate}. * * @param baseTableMetadata the metadata for the base table being updated. * @param generators the generators from which to extract the view mutations from. * @return the mutations created by all the generators in {@code generators}. */ private Collection<Mutation> buildMutations(CFMetaData baseTableMetadata, List<ViewUpdateGenerator> generators) { // One view is probably common enough and we can optimize a bit easily if (generators.size() == 1) { Collection<PartitionUpdate> updates = generators.get(0).generateViewUpdates(); List<Mutation> mutations = new ArrayList<>(updates.size()); for (PartitionUpdate update : updates) mutations.add(new Mutation(update)); return mutations; } Map<DecoratedKey, Mutation> mutations = new HashMap<>(); for (ViewUpdateGenerator generator : generators) { for (PartitionUpdate update : generator.generateViewUpdates()) { DecoratedKey key = update.partitionKey(); Mutation mutation = mutations.get(key); if (mutation == null) { mutation = new Mutation(baseTableMetadata.ksName, key); mutations.put(key, mutation); } mutation.add(update); } } return mutations.values(); } /** * A simple helper that tracks for a given {@code UnfilteredRowIterator} what is the current deletion at any time of the * iteration. It will be the currently open range tombstone deletion if there is one and the partition deletion otherwise. */ private static class DeletionTracker { private final DeletionTime partitionDeletion; private DeletionTime deletion; public DeletionTracker(DeletionTime partitionDeletion) { this.partitionDeletion = partitionDeletion; } public void update(Unfiltered marker) { assert marker instanceof RangeTombstoneMarker; RangeTombstoneMarker rtm = (RangeTombstoneMarker)marker; this.deletion = rtm.isOpen(false) ? rtm.openDeletionTime(false) : null; } public DeletionTime currentDeletion() { return deletion == null ? partitionDeletion : deletion; } } }