TableViews.java example

Explorer
scylla-tools-java-master
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.cassandra.db.view;

import java.util.*;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;

import com.google.common.collect.Iterables;
import com.google.common.collect.Iterators;
import com.google.common.collect.PeekingIterator;

import org.apache.cassandra.config.CFMetaData;
import org.apache.cassandra.db.*;
import org.apache.cassandra.db.commitlog.ReplayPosition;
import org.apache.cassandra.db.filter.*;
import org.apache.cassandra.db.rows.*;
import org.apache.cassandra.db.partitions.*;
import org.apache.cassandra.dht.Token;
import org.apache.cassandra.service.StorageProxy;
import org.apache.cassandra.utils.FBUtilities;
import org.apache.cassandra.utils.btree.BTreeSet;


/**
 * Groups all the views for a given table.
 */
public class TableViews extends AbstractCollection<View>
{
    private final CFMetaData baseTableMetadata;

    // We need this to be thread-safe, but the number of times this is changed (when a view is created in the keyspace)
    // massively exceeds the number of time it's read (for every mutation on the keyspace), so a copy-on-write list is the best option.
    private final List<View> views = new CopyOnWriteArrayList();

    public TableViews(CFMetaData baseTableMetadata)
    {
        this.baseTableMetadata = baseTableMetadata;
    }

    public int size()
    {
        return views.size();
    }

    public Iterator<View> iterator()
    {
        return views.iterator();
    }

    public boolean contains(String viewName)
    {
        return Iterables.any(views, view -> view.name.equals(viewName));
    }

    public boolean add(View view)
    {
        // We should have validated that there is no existing view with this name at this point
        assert !contains(view.name);
        return views.add(view);
    }

    public Iterable<ColumnFamilyStore> allViewsCfs()
    {
        Keyspace keyspace = Keyspace.open(baseTableMetadata.ksName);
        return Iterables.transform(views, view -> keyspace.getColumnFamilyStore(view.getDefinition().viewName));
    }

    public void forceBlockingFlush()
    {
        for (ColumnFamilyStore viewCfs : allViewsCfs())
            viewCfs.forceBlockingFlush();
    }

    public void dumpMemtables()
    {
        for (ColumnFamilyStore viewCfs : allViewsCfs())
            viewCfs.dumpMemtable();
    }

    public void truncateBlocking(ReplayPosition replayAfter, long truncatedAt)
    {
        for (ColumnFamilyStore viewCfs : allViewsCfs())
        {
            viewCfs.discardSSTables(truncatedAt);
            SystemKeyspace.saveTruncationRecord(viewCfs, truncatedAt, replayAfter);
        }
    }

    public void removeByName(String viewName)
    {
        views.removeIf(v -> v.name.equals(viewName));
    }

    /**
     * Calculates and pushes updates to the views replicas. The replicas are determined by
     * {@link ViewUtils#getViewNaturalEndpoint(String, Token, Token)}.
     *
     * @param update an update on the base table represented by this object.
     * @param writeCommitLog whether we should write the commit log for the view updates.
     * @param baseComplete time from epoch in ms that the local base mutation was (or will be) completed
     */
    public void pushViewReplicaUpdates(PartitionUpdate update, boolean writeCommitLog, AtomicLong baseComplete)
    {
        assert update.metadata().cfId.equals(baseTableMetadata.cfId);

        Collection<View> views = updatedViews(update);
        if (views.isEmpty())
            return;

        // Read modified rows
        int nowInSec = FBUtilities.nowInSeconds();
        SinglePartitionReadCommand command = readExistingRowsCommand(update, views, nowInSec);
        if (command == null)
            return;

        ColumnFamilyStore cfs = Keyspace.openAndGetStore(update.metadata());
        long start = System.nanoTime();
        Collection<Mutation> mutations;
        try (ReadOrderGroup orderGroup = command.startOrderGroup();
             UnfilteredRowIterator existings = UnfilteredPartitionIterators.getOnlyElement(command.executeLocally(orderGroup), command);
             UnfilteredRowIterator updates = update.unfilteredIterator())
        {
            mutations = generateViewUpdates(views, updates, existings, nowInSec);
        }
        Keyspace.openAndGetStore(update.metadata()).metric.viewReadTime.update(System.nanoTime() - start, TimeUnit.NANOSECONDS);

        if (!mutations.isEmpty())
            StorageProxy.mutateMV(update.partitionKey().getKey(), mutations, writeCommitLog, baseComplete);
    }

    /**
     * Given some updates on the base table of this object and the existing values for the rows affected by that update, generates the
     * mutation to be applied to the provided views.
     *
     * @param views the views potentially affected by {@code updates}.
     * @param updates the base table updates being applied.
     * @param existings the existing values for the rows affected by {@code updates}. This is used to decide if a view is
     * obsoleted by the update and should be removed, gather the values for columns that may not be part of the update if
     * a new view entry needs to be created, and compute the minimal updates to be applied if the view entry isn't changed
     * but has simply some updated values. This will be empty for view building as we want to assume anything we'll pass
     * to {@code updates} is new.
     * @param nowInSec the current time in seconds.
     * @return the mutations to apply to the {@code views}. This can be empty.
     */
    public Collection<Mutation> generateViewUpdates(Collection<View> views, UnfilteredRowIterator updates, UnfilteredRowIterator existings, int nowInSec)
    {
        assert updates.metadata().cfId.equals(baseTableMetadata.cfId);

        List<ViewUpdateGenerator> generators = new ArrayList<>(views.size());
        for (View view : views)
            generators.add(new ViewUpdateGenerator(view, updates.partitionKey(), nowInSec));

        DeletionTracker existingsDeletion = new DeletionTracker(existings.partitionLevelDeletion());
        DeletionTracker updatesDeletion = new DeletionTracker(updates.partitionLevelDeletion());

        /*
         * We iterate through the updates and the existing rows in parallel. This allows us to know the consequence
         * on the view of each update.
         */
        PeekingIterator<Unfiltered> existingsIter = Iterators.peekingIterator(existings);
        PeekingIterator<Unfiltered> updatesIter = Iterators.peekingIterator(updates);

        while (existingsIter.hasNext() && updatesIter.hasNext())
        {
            Unfiltered existing = existingsIter.peek();
            Unfiltered update = updatesIter.peek();

            Row existingRow;
            Row updateRow;
            int cmp = baseTableMetadata.comparator.compare(update, existing);
            if (cmp < 0)
            {
                // We have an update where there was nothing before
                if (update.isRangeTombstoneMarker())
                {
                    updatesDeletion.update(updatesIter.next());
                    continue;
                }

                updateRow = ((Row)updatesIter.next()).withRowDeletion(updatesDeletion.currentDeletion());
                existingRow = emptyRow(updateRow.clustering(), existingsDeletion.currentDeletion());
            }
            else if (cmp > 0)
            {
                // We have something existing but no update (which will happen either because it's a range tombstone marker in
                // existing, or because we've fetched the existing row due to some partition/range deletion in the updates)
                if (existing.isRangeTombstoneMarker())
                {
                    existingsDeletion.update(existingsIter.next());
                    continue;
                }

                existingRow = ((Row)existingsIter.next()).withRowDeletion(existingsDeletion.currentDeletion());
                updateRow = emptyRow(existingRow.clustering(), updatesDeletion.currentDeletion());

                // The way we build the read command used for existing rows, we should always have updatesDeletion.currentDeletion()
                // that is not live, since we wouldn't have read the existing row otherwise. And we could assert that, but if we ever
                // change the read method so that it can slightly over-read in some case, that would be an easily avoiding bug lurking,
                // so we just handle the case.
                if (updateRow == null)
                    continue;
            }
            else
            {
                // We're updating a row that had pre-existing data
                if (update.isRangeTombstoneMarker())
                {
                    assert existing.isRangeTombstoneMarker();
                    updatesDeletion.update(updatesIter.next());
                    existingsDeletion.update(existingsIter.next());
                    continue;
                }

                assert !existing.isRangeTombstoneMarker();
                existingRow = ((Row)existingsIter.next()).withRowDeletion(existingsDeletion.currentDeletion());
                updateRow = ((Row)updatesIter.next()).withRowDeletion(updatesDeletion.currentDeletion());
            }

            addToViewUpdateGenerators(existingRow, updateRow, generators, nowInSec);
        }

        // We only care about more existing rows if the update deletion isn't live, i.e. if we had a partition deletion
        if (!updatesDeletion.currentDeletion().isLive())
        {
            while (existingsIter.hasNext())
            {
                Unfiltered existing = existingsIter.next();
                // If it's a range tombstone, we don't care, we're only looking for existing entry that gets deleted by
                // the new partition deletion
                if (existing.isRangeTombstoneMarker())
                    continue;

                Row existingRow = (Row)existing;
                addToViewUpdateGenerators(existingRow, emptyRow(existingRow.clustering(), updatesDeletion.currentDeletion()), generators, nowInSec);
            }
        }
        while (updatesIter.hasNext())
        {
            Unfiltered update = updatesIter.next();
            // If it's a range tombstone, it removes nothing pre-exisiting, so we can ignore it for view updates
            if (update.isRangeTombstoneMarker())
                continue;

            Row updateRow = (Row)update;
            addToViewUpdateGenerators(emptyRow(updateRow.clustering(), DeletionTime.LIVE), updateRow, generators, nowInSec);
        }

        return buildMutations(baseTableMetadata, generators);
    }

    /**
     * Return the views that are potentially updated by the provided updates.
     *
     * @param updates the updates applied to the base table.
     * @return the views affected by {@code updates}.
     */
    public Collection<View> updatedViews(PartitionUpdate updates)
    {
        List<View> matchingViews = new ArrayList<>(views.size());

        for (View view : views)
        {
            ReadQuery selectQuery = view.getReadQuery();
            if (!selectQuery.selectsKey(updates.partitionKey()))
                continue;

            matchingViews.add(view);
        }
        return matchingViews;
    }

    /**
     * Returns the command to use to read the existing rows required to generate view updates for the provided base
     * base updates.
     *
     * @param updates the base table updates being applied.
     * @param views the views potentially affected by {@code updates}.
     * @param nowInSec the current time in seconds.
     * @return the command to use to read the base table rows required to generate view updates for {@code updates}.
     */
    private SinglePartitionReadCommand readExistingRowsCommand(PartitionUpdate updates, Collection<View> views, int nowInSec)
    {
        Slices.Builder sliceBuilder = null;
        DeletionInfo deletionInfo = updates.deletionInfo();
        CFMetaData metadata = updates.metadata();
        DecoratedKey key = updates.partitionKey();
        // TODO: This is subtle: we need to gather all the slices that we have to fetch between partition del, range tombstones and rows.
        if (!deletionInfo.isLive())
        {
            sliceBuilder = new Slices.Builder(metadata.comparator);
            // Everything covered by a deletion might invalidate an existing view entry, which means we must read it to know. In practice
            // though, the views involved might filter some base table clustering columns, in which case we can restrict what we read
            // using those restrictions.
            // If there is a partition deletion, then we can simply take each slices from each view select filter. They may overlap but
            // the Slices.Builder handles that for us. Note that in many case this will just involve reading everything (as soon as any
            // view involved has no clustering restrictions for instance).
            // For range tombstone, we should theoretically take the difference between the range tombstoned and the slices selected
            // by every views, but as we don't an easy way to compute that right now, we keep it simple and just use the tombstoned
            // range.
            // TODO: we should improve that latter part.
            if (!deletionInfo.getPartitionDeletion().isLive())
            {
                for (View view : views)
                    sliceBuilder.addAll(view.getSelectStatement().clusteringIndexFilterAsSlices());
            }
            else
            {
                assert deletionInfo.hasRanges();
                Iterator<RangeTombstone> iter = deletionInfo.rangeIterator(false);
                while (iter.hasNext())
                    sliceBuilder.add(iter.next().deletedSlice());
            }
        }

        // We need to read every row that is updated, unless we can prove that it has no impact on any view entries.

        // If we had some slices from the deletions above, we'll continue using that. Otherwise, it's more efficient to build
        // a names query.
        BTreeSet.Builder<Clustering> namesBuilder = sliceBuilder == null ? BTreeSet.builder(metadata.comparator) : null;
        for (Row row : updates)
        {
            // Don't read the existing state if we can prove the update won't affect any views
            if (!affectsAnyViews(key, row, views))
                continue;

            if (namesBuilder == null)
                sliceBuilder.add(Slice.make(row.clustering()));
            else
                namesBuilder.add(row.clustering());
        }

        NavigableSet<Clustering> names = namesBuilder == null ? null : namesBuilder.build();
        // If we have a slice builder, it means we had some deletions and we have to read. But if we had
        // only row updates, it's possible none of them affected the views, in which case we have nothing
        // to do.
        if (names != null && names.isEmpty())
            return null;

        ClusteringIndexFilter clusteringFilter = names == null
                                               ? new ClusteringIndexSliceFilter(sliceBuilder.build(), false)
                                               : new ClusteringIndexNamesFilter(names, false);
        // If we have more than one view, we should merge the queried columns by each views but to keep it simple we just
        // include everything. We could change that in the future.
        ColumnFilter queriedColumns = views.size() == 1
                                    ? Iterables.getOnlyElement(views).getSelectStatement().queriedColumns()
                                    : ColumnFilter.all(metadata);
        // Note that the views could have restrictions on regular columns, but even if that's the case we shouldn't apply those
        // when we read, because even if an existing row doesn't match the view filter, the update can change that in which
        // case we'll need to know the existing content. There is also no easy way to merge those RowFilter when we have multiple views.
        // TODO: we could still make sense to special case for when there is a single view and a small number of updates (and
        // no deletions). Indeed, in that case we could check whether any of the update modify any of the restricted regular
        // column, and if that's not the case we could use view filter. We keep it simple for now though.
        RowFilter rowFilter = RowFilter.NONE;
        return SinglePartitionReadCommand.create(metadata, nowInSec, queriedColumns, rowFilter, DataLimits.NONE, key, clusteringFilter);
    }

    private boolean affectsAnyViews(DecoratedKey partitionKey, Row update, Collection<View> views)
    {
        for (View view : views)
        {
            if (view.mayBeAffectedBy(partitionKey, update))
                return true;
        }
        return false;
    }

    /**
     * Given an existing base row and the update that we're going to apply to this row, generate the modifications
     * to apply to MVs using the provided {@code ViewUpdateGenerator}s.
     *
     * @param existingBaseRow the base table row as it is before an update.
     * @param updateBaseRow the newly updates made to {@code existingBaseRow}.
     * @param generators the view update generators to add the new changes to.
     * @param nowInSec the current time in seconds. Used to decide if data is live or not.
     */
    private static void addToViewUpdateGenerators(Row existingBaseRow, Row updateBaseRow, Collection<ViewUpdateGenerator> generators, int nowInSec)
    {
        // Having existing empty is useful, it just means we'll insert a brand new entry for updateBaseRow,
        // but if we have no update at all, we shouldn't get there.
        assert !updateBaseRow.isEmpty();

        // We allow existingBaseRow to be null, which we treat the same as being empty as an small optimization
        // to avoid allocating empty row objects when we know there was nothing existing.
        Row mergedBaseRow = existingBaseRow == null ? updateBaseRow : Rows.merge(existingBaseRow, updateBaseRow, nowInSec);
        for (ViewUpdateGenerator generator : generators)
            generator.addBaseTableUpdate(existingBaseRow, mergedBaseRow);
    }

    private static Row emptyRow(Clustering clustering, DeletionTime deletion)
    {
        // Returning null for an empty row is slightly ugly, but the case where there is no pre-existing row is fairly common
        // (especially when building the view), so we want to avoid a dummy allocation of an empty row every time.
        // And MultiViewUpdateBuilder knows how to deal with that.
        return deletion.isLive() ? null : BTreeRow.emptyDeletedRow(clustering, Row.Deletion.regular(deletion));
    }

    /**
     * Extracts (and potentially groups) the mutations generated by the provided view update generator.
     * Returns the mutation that needs to be done to the views given the base table updates
     * passed to {@link #addBaseTableUpdate}.
     *
     * @param baseTableMetadata the metadata for the base table being updated.
     * @param generators the generators from which to extract the view mutations from.
     * @return the mutations created by all the generators in {@code generators}.
     */
    private Collection<Mutation> buildMutations(CFMetaData baseTableMetadata, List<ViewUpdateGenerator> generators)
    {
        // One view is probably common enough and we can optimize a bit easily
        if (generators.size() == 1)
        {
            Collection<PartitionUpdate> updates = generators.get(0).generateViewUpdates();
            List<Mutation> mutations = new ArrayList<>(updates.size());
            for (PartitionUpdate update : updates)
                mutations.add(new Mutation(update));
            return mutations;
        }

        Map<DecoratedKey, Mutation> mutations = new HashMap<>();
        for (ViewUpdateGenerator generator : generators)
        {
            for (PartitionUpdate update : generator.generateViewUpdates())
            {
                DecoratedKey key = update.partitionKey();
                Mutation mutation = mutations.get(key);
                if (mutation == null)
                {
                    mutation = new Mutation(baseTableMetadata.ksName, key);
                    mutations.put(key, mutation);
                }
                mutation.add(update);
            }
        }
        return mutations.values();
    }

    /**
     * A simple helper that tracks for a given {@code UnfilteredRowIterator} what is the current deletion at any time of the
     * iteration. It will be the currently open range tombstone deletion if there is one and the partition deletion otherwise.
     */
    private static class DeletionTracker
    {
        private final DeletionTime partitionDeletion;
        private DeletionTime deletion;

        public DeletionTracker(DeletionTime partitionDeletion)
        {
            this.partitionDeletion = partitionDeletion;
        }

        public void update(Unfiltered marker)
        {
            assert marker instanceof RangeTombstoneMarker;
            RangeTombstoneMarker rtm = (RangeTombstoneMarker)marker;
            this.deletion = rtm.isOpen(false)
                          ? rtm.openDeletionTime(false)
                          : null;
        }

        public DeletionTime currentDeletion()
        {
            return deletion == null ? partitionDeletion : deletion;
        }
    }
}