CompositesSearcher.java example

Explorer
cassa-master
- cassandra-trunk
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.cassandra.index.internal.composites;

import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;

import org.apache.cassandra.schema.TableMetadata;
import org.apache.cassandra.db.*;
import org.apache.cassandra.db.filter.ClusteringIndexNamesFilter;
import org.apache.cassandra.db.filter.ClusteringIndexSliceFilter;
import org.apache.cassandra.db.filter.DataLimits;
import org.apache.cassandra.db.filter.RowFilter;
import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
import org.apache.cassandra.db.rows.*;
import org.apache.cassandra.db.transform.Transformation;
import org.apache.cassandra.index.internal.CassandraIndex;
import org.apache.cassandra.index.internal.CassandraIndexSearcher;
import org.apache.cassandra.index.internal.IndexEntry;
import org.apache.cassandra.utils.btree.BTreeSet;
import org.apache.cassandra.utils.concurrent.OpOrder;


public class CompositesSearcher extends CassandraIndexSearcher
{
    public CompositesSearcher(ReadCommand command,
                              RowFilter.Expression expression,
                              CassandraIndex index)
    {
        super(command, expression, index);
    }

    private boolean isMatchingEntry(DecoratedKey partitionKey, IndexEntry entry, ReadCommand command)
    {
        return command.selectsKey(partitionKey) && command.selectsClustering(partitionKey, entry.indexedEntryClustering);
    }

    private boolean isStaticColumn()
    {
        return index.getIndexedColumn().isStatic();
    }

    protected UnfilteredPartitionIterator queryDataFromIndex(final DecoratedKey indexKey,
                                                             final RowIterator indexHits,
                                                             final ReadCommand command,
                                                             final ReadExecutionController executionController)
    {
        assert indexHits.staticRow() == Rows.EMPTY_STATIC_ROW;

        return new UnfilteredPartitionIterator()
        {
            private IndexEntry nextEntry;

            private UnfilteredRowIterator next;

            public TableMetadata metadata()
            {
                return command.metadata();
            }

            public boolean hasNext()
            {
                return prepareNext();
            }

            public UnfilteredRowIterator next()
            {
                if (next == null)
                    prepareNext();

                UnfilteredRowIterator toReturn = next;
                next = null;
                return toReturn;
            }

            private boolean prepareNext()
            {
                while (true)
                {
                    if (next != null)
                        return true;

                    if (nextEntry == null)
                    {
                        if (!indexHits.hasNext())
                            return false;

                        nextEntry = index.decodeEntry(indexKey, indexHits.next());
                    }

                    SinglePartitionReadCommand dataCmd;
                    DecoratedKey partitionKey = index.baseCfs.decorateKey(nextEntry.indexedKey);
                    List<IndexEntry> entries = new ArrayList<>();
                    if (isStaticColumn())
                    {
                        // The index hit may not match the commad key constraint
                        if (!isMatchingEntry(partitionKey, nextEntry, command)) {
                            nextEntry = indexHits.hasNext() ? index.decodeEntry(indexKey, indexHits.next()) : null;
                            continue;
                        }

                        // If the index is on a static column, we just need to do a full read on the partition.
                        // Note that we want to re-use the command.columnFilter() in case of future change.
                        dataCmd = SinglePartitionReadCommand.create(index.baseCfs.metadata(),
                                                                    command.nowInSec(),
                                                                    command.columnFilter(),
                                                                    RowFilter.NONE,
                                                                    DataLimits.NONE,
                                                                    partitionKey,
                                                                    new ClusteringIndexSliceFilter(Slices.ALL, false));
                        entries.add(nextEntry);
                        nextEntry = indexHits.hasNext() ? index.decodeEntry(indexKey, indexHits.next()) : null;
                    }
                    else
                    {
                        // Gather all index hits belonging to the same partition and query the data for those hits.
                        // TODO: it's much more efficient to do 1 read for all hits to the same partition than doing
                        // 1 read per index hit. However, this basically mean materializing all hits for a partition
                        // in memory so we should consider adding some paging mechanism. However, index hits should
                        // be relatively small so it's much better than the previous code that was materializing all
                        // *data* for a given partition.
                        BTreeSet.Builder<Clustering> clusterings = BTreeSet.builder(index.baseCfs.getComparator());
                        while (nextEntry != null && partitionKey.getKey().equals(nextEntry.indexedKey))
                        {
                            // We're queried a slice of the index, but some hits may not match some of the clustering column constraints
                            if (isMatchingEntry(partitionKey, nextEntry, command))
                            {
                                clusterings.add(nextEntry.indexedEntryClustering);
                                entries.add(nextEntry);
                            }

                            nextEntry = indexHits.hasNext() ? index.decodeEntry(indexKey, indexHits.next()) : null;
                        }

                        // Because we've eliminated entries that don't match the clustering columns, it's possible we added nothing
                        if (clusterings.isEmpty())
                            continue;

                        // Query the gathered index hits. We still need to filter stale hits from the resulting query.
                        ClusteringIndexNamesFilter filter = new ClusteringIndexNamesFilter(clusterings.build(), false);
                        dataCmd = SinglePartitionReadCommand.create(index.baseCfs.metadata(),
                                                                    command.nowInSec(),
                                                                    command.columnFilter(),
                                                                    command.rowFilter(),
                                                                    DataLimits.NONE,
                                                                    partitionKey,
                                                                    filter);
                    }

                    @SuppressWarnings("resource") // We close right away if empty, and if it's assign to next it will be called either
                    // by the next caller of next, or through closing this iterator is this come before.
                    UnfilteredRowIterator dataIter =
                        filterStaleEntries(dataCmd.queryMemtableAndDisk(index.baseCfs, executionController),
                                           indexKey.getKey(),
                                           entries,
                                           executionController.writeOpOrderGroup(),
                                           command.nowInSec());

                    if (dataIter.isEmpty())
                    {
                        dataIter.close();
                        continue;
                    }

                    next = dataIter;
                    return true;
                }
            }

            public void remove()
            {
                throw new UnsupportedOperationException();
            }

            public void close()
            {
                indexHits.close();
                if (next != null)
                    next.close();
            }
        };
    }

    private void deleteAllEntries(final List<IndexEntry> entries, final OpOrder.Group writeOp, final int nowInSec)
    {
        entries.forEach(entry ->
            index.deleteStaleEntry(entry.indexValue,
                                   entry.indexClustering,
                                   new DeletionTime(entry.timestamp, nowInSec),
                                   writeOp));
    }

    // We assume all rows in dataIter belong to the same partition.
    private UnfilteredRowIterator filterStaleEntries(UnfilteredRowIterator dataIter,
                                                     final ByteBuffer indexValue,
                                                     final List<IndexEntry> entries,
                                                     final OpOrder.Group writeOp,
                                                     final int nowInSec)
    {
        // collect stale index entries and delete them when we close this iterator
        final List<IndexEntry> staleEntries = new ArrayList<>();

        // if there is a partition level delete in the base table, we need to filter
        // any index entries which would be shadowed by it
        if (!dataIter.partitionLevelDeletion().isLive())
        {
            DeletionTime deletion = dataIter.partitionLevelDeletion();
            entries.forEach(e -> {
                if (deletion.deletes(e.timestamp))
                    staleEntries.add(e);
            });
        }

        UnfilteredRowIterator iteratorToReturn = null;
        if (isStaticColumn())
        {
            if (entries.size() != 1)
                throw new AssertionError("A partition should have at most one index within a static column index");

            iteratorToReturn = dataIter;
            if (index.isStale(dataIter.staticRow(), indexValue, nowInSec))
            {
                // The entry is staled, we return no rows in this partition.
                staleEntries.addAll(entries);
                iteratorToReturn = UnfilteredRowIterators.noRowsIterator(dataIter.metadata(),
                                                                         dataIter.partitionKey(),
                                                                         Rows.EMPTY_STATIC_ROW,
                                                                         dataIter.partitionLevelDeletion(),
                                                                         dataIter.isReverseOrder());
            }
            deleteAllEntries(staleEntries, writeOp, nowInSec);
        }
        else
        {
            ClusteringComparator comparator = dataIter.metadata().comparator;

            class Transform extends Transformation
            {
                private int entriesIdx;

                @Override
                public Row applyToRow(Row row)
                {
                    IndexEntry entry = findEntry(row.clustering());
                    if (!index.isStale(row, indexValue, nowInSec))
                        return row;

                    staleEntries.add(entry);
                    return null;
                }

                private IndexEntry findEntry(Clustering clustering)
                {
                    assert entriesIdx < entries.size();
                    while (entriesIdx < entries.size())
                    {
                        IndexEntry entry = entries.get(entriesIdx++);
                        // The entries are in clustering order. So that the requested entry should be the
                        // next entry, the one at 'entriesIdx'. However, we can have stale entries, entries
                        // that have no corresponding row in the base table typically because of a range
                        // tombstone or partition level deletion. Delete such stale entries.
                        // For static column, we only need to compare the partition key, otherwise we compare
                        // the whole clustering.
                        int cmp = comparator.compare(entry.indexedEntryClustering, clustering);
                        assert cmp <= 0; // this would means entries are not in clustering order, which shouldn't happen
                        if (cmp == 0)
                            return entry;
                        else
                            staleEntries.add(entry);
                    }
                    // entries correspond to the rows we've queried, so we shouldn't have a row that has no corresponding entry.
                    throw new AssertionError();
                }

                @Override
                public void onPartitionClose()
                {
                    deleteAllEntries(staleEntries, writeOp, nowInSec);
                }
            }
            iteratorToReturn = Transformation.apply(dataIter, new Transform());
        }

        return iteratorToReturn;
    }
}