/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.cassandra.db.columniterator;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.Iterator;
import java.util.List;

import com.google.common.collect.AbstractIterator;

import org.apache.cassandra.db.*;
import org.apache.cassandra.db.filter.ColumnSlice;
import org.apache.cassandra.db.marshal.AbstractType;
import org.apache.cassandra.io.sstable.CorruptSSTableException;
import org.apache.cassandra.io.sstable.IndexHelper;
import org.apache.cassandra.io.sstable.IndexHelper.IndexInfo;
import org.apache.cassandra.io.sstable.SSTableReader;
import org.apache.cassandra.io.util.FileDataInput;
import org.apache.cassandra.io.util.FileMark;
import org.apache.cassandra.tracing.Tracing;
import org.apache.cassandra.utils.ByteBufferUtil;

/**
 * This is a reader that finds the block for a starting column and returns blocks before/after it for each next call.
 * It assumes that the CF is sorted by name and exploits the name index.
 */
class IndexedSliceReader extends AbstractIterator<OnDiskAtom> implements OnDiskAtomIterator
{
    private final ColumnFamily emptyColumnFamily;

    private final SSTableReader sstable;
    private final List<IndexHelper.IndexInfo> indexes;
    private final FileDataInput originalInput;
    private FileDataInput file;
    private final boolean reversed;
    private final ColumnSlice[] slices;
    private final BlockFetcher fetcher;
    private final Deque<OnDiskAtom> blockColumns = new ArrayDeque<OnDiskAtom>();
    private final AbstractType<?> comparator;

    // Holds range tombstones in reversed queries. See addColumn()
    private final Deque<OnDiskAtom> rangeTombstonesReversed;

    /**
     * This slice reader assumes that slices are sorted correctly, i.e. that for forward lookup slices are in
     * lexicographic order of start elements and that for reverse lookup they are in reverse lexicographic order of
     * finish (reverse start) elements, e.g. forward: [a,b],[d,e],[g,h]; reverse: [h,g],[e,d],[b,a]. This reader also
     * assumes that the slices have been validated as non-overlapping intervals.
     */
    public IndexedSliceReader(SSTableReader sstable, RowIndexEntry indexEntry, FileDataInput input, ColumnSlice[] slices, boolean reversed)
    {
        Tracing.trace("Seeking to partition indexed section in data file");
        this.sstable = sstable;
        this.originalInput = input;
        this.reversed = reversed;
        this.slices = slices;
        this.comparator = sstable.metadata.comparator;
        this.rangeTombstonesReversed = reversed ? new ArrayDeque<OnDiskAtom>() : null;
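        // The fetcher strategy depends on whether the row has a promoted column index: without
        // one we must seek to the row head and scan forward (SimpleBlockFetcher, which also
        // deserializes the row deletion info from disk), with one we can jump straight to the
        // relevant index blocks (IndexedBlockFetcher, which reads deletion info from the entry).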
        try
        {
            this.indexes = indexEntry.columnsIndex();
            emptyColumnFamily = EmptyColumns.factory.create(sstable.metadata);
            if (indexes.isEmpty())
            {
                setToRowStart(indexEntry, input);
                emptyColumnFamily.delete(DeletionTime.serializer.deserialize(file));
                fetcher = new SimpleBlockFetcher();
            }
            else
            {
                emptyColumnFamily.delete(indexEntry.deletionTime());
                fetcher = new IndexedBlockFetcher(indexEntry.position);
            }
        }
        catch (IOException e)
        {
            sstable.markSuspect();
            throw new CorruptSSTableException(e, file.getPath());
        }
    }

    /**
     * Sets the seek position to the start of the row for column scanning.
     */
    private void setToRowStart(RowIndexEntry rowEntry, FileDataInput in) throws IOException
    {
        if (in == null)
        {
            this.file = sstable.getFileDataInput(rowEntry.position);
        }
        else
        {
            this.file = in;
            in.seek(rowEntry.position);
        }
        sstable.partitioner.decorateKey(ByteBufferUtil.readWithShortLength(file));
        if (sstable.descriptor.version.hasRowSizeAndColumnCount)
            file.readLong();
    }

    public ColumnFamily getColumnFamily()
    {
        return emptyColumnFamily;
    }

    public DecoratedKey getKey()
    {
        throw new UnsupportedOperationException();
    }

    protected OnDiskAtom computeNext()
    {
        while (true)
        {
            if (reversed)
            {
                // Return all tombstones for the block first (see addColumn() below)
                OnDiskAtom column = rangeTombstonesReversed.poll();
                if (column != null)
                    return column;
            }

            OnDiskAtom column = blockColumns.poll();
            if (column == null)
            {
                if (!fetcher.fetchMoreData())
                    return endOfData();
            }
            else
            {
                return column;
            }
        }
    }

    public void close() throws IOException
    {
        if (originalInput == null && file != null)
            file.close();
    }

    protected void addColumn(OnDiskAtom col)
    {
        if (reversed)
        {
            /*
             * We put range tombstone markers at the beginning of the range they delete. But for reversed queries,
             * the caller still needs to know about a RangeTombstone before it sees any column that it covers.
             * To make that simple, we keep said tombstones separate and return them all before any column for
             * a given block.
             */
            if (col instanceof RangeTombstone)
                rangeTombstonesReversed.addFirst(col);
            else
                blockColumns.addFirst(col);
        }
        else
        {
            blockColumns.addLast(col);
        }
    }

    static int indexFor(SSTableReader sstable, ByteBuffer name, List<IndexHelper.IndexInfo> indexes, AbstractType<?> comparator, boolean reversed, int startIdx)
    {
        // If it's a super CF and the sstable is from the old format, then the index will contain old format info, i.e. non-composite
        // SC names. So we need to 1) use only the SC name part of the comparator and 2) extract only that part from 'name'.
        if (sstable.metadata.isSuper() && sstable.descriptor.version.hasSuperColumns)
        {
            AbstractType<?> scComparator = SuperColumns.getComparatorFor(sstable.metadata, false);
            ByteBuffer scName = SuperColumns.scName(name);
            return IndexHelper.indexFor(scName, indexes, scComparator, reversed, startIdx);
        }
        return IndexHelper.indexFor(name, indexes, comparator, reversed, startIdx);
    }

    static ByteBuffer forIndexComparison(SSTableReader sstable, ByteBuffer name)
    {
        // See indexFor above.
        return sstable.metadata.isSuper() && sstable.descriptor.version.hasSuperColumns
             ? SuperColumns.scName(name)
             : name;
    }

    static AbstractType<?> comparatorForIndex(SSTableReader sstable, AbstractType<?> comparator)
    {
        return sstable.metadata.isSuper() && sstable.descriptor.version.hasSuperColumns
             ? SuperColumns.getComparatorFor(sstable.metadata, false)
             : comparator;
    }
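
    /*
     * A BlockFetcher is responsible for pulling the next batch of atoms into blockColumns.
     * Two implementations exist: IndexedBlockFetcher walks the promoted column index and
     * deserializes only the blocks that may intersect the queried slices, while
     * SimpleBlockFetcher (used when the row has no index) deserializes the whole row in one pass.
     */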
    private abstract class BlockFetcher
    {
        protected int currentSliceIdx;

        protected BlockFetcher(int sliceIdx)
        {
            this.currentSliceIdx = sliceIdx;
        }

        /*
         * Return the smallest key selected by the current ColumnSlice.
         */
        protected ByteBuffer currentStart()
        {
            return reversed ? slices[currentSliceIdx].finish : slices[currentSliceIdx].start;
        }

        /*
         * Return the biggest key selected by the current ColumnSlice.
         */
        protected ByteBuffer currentFinish()
        {
            return reversed ? slices[currentSliceIdx].start : slices[currentSliceIdx].finish;
        }

        protected abstract boolean setNextSlice();

        protected abstract boolean fetchMoreData();

        protected boolean isColumnBeforeSliceStart(OnDiskAtom column)
        {
            return isBeforeSliceStart(column.name());
        }

        protected boolean isBeforeSliceStart(ByteBuffer name)
        {
            ByteBuffer start = currentStart();
            return start.remaining() != 0 && comparator.compare(name, start) < 0;
        }

        protected boolean isIndexEntryBeforeSliceStart(ByteBuffer name)
        {
            ByteBuffer start = currentStart();
            return start.remaining() != 0 && comparatorForIndex(sstable, comparator).compare(name, forIndexComparison(sstable, start)) < 0;
        }

        protected boolean isColumnBeforeSliceFinish(OnDiskAtom column)
        {
            ByteBuffer finish = currentFinish();
            return finish.remaining() == 0 || comparator.compare(column.name(), finish) <= 0;
        }

        protected boolean isIndexEntryAfterSliceFinish(ByteBuffer name)
        {
            ByteBuffer finish = currentFinish();
            return finish.remaining() != 0 && comparatorForIndex(sstable, comparator).compare(name, forIndexComparison(sstable, finish)) > 0;
        }
    }

    private class IndexedBlockFetcher extends BlockFetcher
    {
        // where this row starts
        private final long columnsStart;

        // the index entry for the next block to deserialize
        private int nextIndexIdx = -1;

        // index of the last block we've read from disk
        private int lastDeserializedBlock = -1;

        // For reversed, keep columns at the beginning of the last deserialized block that
        // may still match a slice
        private final Deque<OnDiskAtom> prefetched;

        public IndexedBlockFetcher(long columnsStart)
        {
            super(-1);
            this.columnsStart = columnsStart;
            this.prefetched = reversed ? new ArrayDeque<OnDiskAtom>() : null;
            setNextSlice();
        }

        protected boolean setNextSlice()
        {
            while (++currentSliceIdx < slices.length)
            {
                nextIndexIdx = indexFor(sstable, slices[currentSliceIdx].start, indexes, comparator, reversed, nextIndexIdx);
                if (nextIndexIdx < 0 || nextIndexIdx >= indexes.size())
                    // no index block for that slice
                    continue;

                // Check if we can exclude this slice entirely from the index
                IndexInfo info = indexes.get(nextIndexIdx);
                if (reversed)
                {
                    if (!isIndexEntryBeforeSliceStart(info.lastName))
                        return true;
                }
                else
                {
                    if (!isIndexEntryAfterSliceFinish(info.firstName))
                        return true;
                }
            }
            nextIndexIdx = -1;
            return false;
        }

        protected boolean hasMoreSlice()
        {
            return currentSliceIdx < slices.length;
        }
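
        // Fetch more atoms into blockColumns for the remaining slices. Returns false once no
        // more data can match, either because the slices or the row's index blocks are exhausted.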
        protected boolean fetchMoreData()
        {
            if (!hasMoreSlice())
                return false;

            // If we read blocks in reversed disk order, we may have columns from the previous block to handle.
            // Note that prefetched keeps columns in reversed disk order.
            // Also note that range tombstone handling is a bit tricky, because we may run into range tombstones
            // that cover a slice *after* we've moved to the previous slice. To keep it simple, we include
            // every RT in prefetched: doing so is only slightly inefficient, and only so many RTs can be
            // mistakenly added this way.
            if (reversed && !prefetched.isEmpty())
            {
                // Avoids some comparisons when we know they are not useful
                boolean inSlice = false;

                OnDiskAtom prefetchedCol;
                while ((prefetchedCol = prefetched.peek()) != null)
                {
                    // col is before slice; we may have to advance to the next slice
                    if (isColumnBeforeSliceStart(prefetchedCol))
                    {
                        inSlice = false;

                        // As explained above, we add RTs unconditionally
                        if (prefetchedCol instanceof RangeTombstone)
                        {
                            blockColumns.addLast(prefetched.poll());
                            continue;
                        }

                        // Otherwise, move to the next slice. If we have no more slices, then
                        // simply unwind prefetched entirely, keeping only the RTs.
                        if (!setNextSlice())
                        {
                            while ((prefetchedCol = prefetched.poll()) != null)
                                if (prefetchedCol instanceof RangeTombstone)
                                    blockColumns.addLast(prefetchedCol);
                            break;
                        }
                    }
                    // col is within slice
                    // (we go in reverse, so as soon as we are in a slice, no need to check
                    // we're after the slice until we change slice)
                    else if (inSlice || isColumnBeforeSliceFinish(prefetchedCol))
                    {
                        blockColumns.addLast(prefetched.poll());
                        inSlice = true;
                    }
                    // if col is after slice, ignore
                    else
                    {
                        prefetched.poll();
                    }
                }
                if (!blockColumns.isEmpty())
                    return true;
                else if (!hasMoreSlice())
                    return false;
            }
            try
            {
                return getNextBlock();
            }
            catch (IOException e)
            {
                throw new CorruptSSTableException(e, file.getPath());
            }
        }
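
        /*
         * Deserializes the next index block (the previous one on disk when reversed) and
         * distributes its atoms between blockColumns and, for reversed reads, prefetched.
         * Returns false when there is no further block to read.
         */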
        private boolean getNextBlock() throws IOException
        {
            if (lastDeserializedBlock == nextIndexIdx)
            {
                if (reversed)
                    nextIndexIdx--;
                else
                    nextIndexIdx++;
            }
            lastDeserializedBlock = nextIndexIdx;

            // Are we done?
            if (lastDeserializedBlock < 0 || lastDeserializedBlock >= indexes.size())
                return false;

            IndexInfo currentIndex = indexes.get(lastDeserializedBlock);

            /* seek to the correct offset to the data, and calculate the data size */
            long positionToSeek = columnsStart + currentIndex.offset;

            // With new promoted indexes, our first seek in the data file will happen at that point.
            if (file == null)
                file = originalInput == null ? sstable.getFileDataInput(positionToSeek) : originalInput;

            // Give a bogus atom count since we'll deserialize as long as we're
            // within the index block, but we don't know how many atoms there are
            Iterator<OnDiskAtom> atomIterator = emptyColumnFamily.metadata().getOnDiskIterator(file, Integer.MAX_VALUE, sstable.descriptor.version);
            file.seek(positionToSeek);
            FileMark mark = file.mark();

            // We remember when we are within a slice to avoid some comparisons
            boolean inSlice = false;

            // scan from index start
            OnDiskAtom column = null;
            while (file.bytesPastMark(mark) < currentIndex.width || column != null)
            {
                // Only fetch a new column if we haven't dealt with the previous one.
                if (column == null)
                    column = atomIterator.next();

                // col is before slice
                // (If in slice, don't bother checking that until we change slice)
                if (!inSlice && isColumnBeforeSliceStart(column))
                {
                    // If it's a rangeTombstone, then we need to read it and include it unless its end
                    // stops before our slice start.
                    if (column instanceof RangeTombstone && !isBeforeSliceStart(((RangeTombstone)column).max))
                    {
                        addColumn(column);
                    }
                    else if (reversed)
                    {
                        // the next slice selects columns that are before the current one, so it may
                        // match this column; keep it around.
                        prefetched.addFirst(column);
                    }
                    column = null;
                }
                // col is within slice
                else if (isColumnBeforeSliceFinish(column))
                {
                    inSlice = true;
                    addColumn(column);
                    column = null;
                }
                // col is after slice.
                else
                {
                    // When reading forward, if we hit a column that sorts after the current slice, it means we're done with this slice.
                    // For reversed, this may either mean that we're done with the current slice, or that we need to read the previous
                    // index block. However, we can be sure we are in the first case (the current slice is done) if the first
                    // columns of the block were not part of the current slice, i.e. if we have columns in prefetched.
                    if (reversed && prefetched.isEmpty())
                        break;

                    if (!setNextSlice())
                        break;

                    inSlice = false;

                    // The next index block now corresponds to the first block that may have columns for the newly set slice.
                    // So if it's different from the current block, we're done with this block. And in that case, we know
                    // that our prefetched columns won't match.
                    if (nextIndexIdx != lastDeserializedBlock)
                    {
                        if (reversed)
                            prefetched.clear();
                        break;
                    }

                    // Even if the next slice may have columns in this block, if we're reversed, those columns have been
                    // prefetched and we're done with this block
                    if (reversed)
                        break;

                    // otherwise, we will deal with that column at the next iteration
                }
            }
            return true;
        }
    }

    private class SimpleBlockFetcher extends BlockFetcher
    {
        public SimpleBlockFetcher() throws IOException
        {
            // Since we have to deserialize in order and will read all slices, we might as well reverse the slices
            // and behave as if the read was not reversed
            super(reversed ? slices.length - 1 : 0);

            // We remember when we are within a slice to avoid some comparisons
            boolean inSlice = false;

            int columnCount = sstable.descriptor.version.hasRowSizeAndColumnCount ? file.readInt() : Integer.MAX_VALUE;
            Iterator<OnDiskAtom> atomIterator = emptyColumnFamily.metadata().getOnDiskIterator(file, columnCount, sstable.descriptor.version);
            OnDiskAtom column = null;
            while (atomIterator.hasNext() || column != null)
            {
                // Only fetch a new column if we haven't dealt with the previous one.
                if (column == null)
                    column = atomIterator.next();

                // col is before slice
                // (If in slice, don't bother checking that until we change slice)
                if (!inSlice && isColumnBeforeSliceStart(column))
                {
                    // If it's a rangeTombstone, then we need to read it and include it unless its end
                    // stops before our slice start.
                    if (column instanceof RangeTombstone && !isBeforeSliceStart(((RangeTombstone)column).max))
                        addColumn(column);

                    column = null;
                    continue;
                }

                // col is within slice
                if (isColumnBeforeSliceFinish(column))
                {
                    inSlice = true;
                    addColumn(column);
                    column = null;
                }
                // col is after slice. more slices?
                else
                {
                    inSlice = false;
                    if (!setNextSlice())
                        break;
                }
            }
        }

        protected boolean setNextSlice()
        {
            if (reversed)
            {
                if (currentSliceIdx <= 0)
                    return false;

                currentSliceIdx--;
            }
            else
            {
                if (currentSliceIdx >= slices.length - 1)
                    return false;

                currentSliceIdx++;
            }
            return true;
        }

        protected boolean fetchMoreData()
        {
            return false;
        }
    }
}