/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.cassandra.index.sasi.disk;

import java.io.*;
import java.nio.ByteBuffer;
import java.util.*;
import java.util.stream.Collectors;

import org.apache.cassandra.db.DecoratedKey;
import org.apache.cassandra.index.sasi.Term;
import org.apache.cassandra.index.sasi.plan.Expression;
import org.apache.cassandra.index.sasi.plan.Expression.Op;
import org.apache.cassandra.index.sasi.utils.MappedBuffer;
import org.apache.cassandra.index.sasi.utils.RangeUnionIterator;
import org.apache.cassandra.index.sasi.utils.AbstractIterator;
import org.apache.cassandra.index.sasi.utils.RangeIterator;
import org.apache.cassandra.db.marshal.AbstractType;
import org.apache.cassandra.io.FSReadError;
import org.apache.cassandra.io.util.ChannelProxy;
import org.apache.cassandra.io.util.FileUtils;
import org.apache.cassandra.utils.ByteBufferUtil;
import org.apache.cassandra.utils.FBUtilities;

import com.google.common.base.Function;
import com.google.common.collect.Iterables;
import com.google.common.collect.Iterators;
import com.google.common.collect.PeekingIterator;

import static org.apache.cassandra.index.sasi.disk.OnDiskBlock.SearchResult;

public class OnDiskIndex implements Iterable<OnDiskIndex.DataTerm>, Closeable
{
    public enum IteratorOrder
    {
        DESC(1), ASC(-1);

        public final int step;

        IteratorOrder(int step)
        {
            this.step = step;
        }

        public int startAt(OnDiskBlock<DataTerm> block, Expression e)
        {
            switch (this)
            {
                case DESC:
                    return e.lower == null
                            ? 0
                            : startAt(block.search(e.validator, e.lower.value), e.lower.inclusive);

                case ASC:
                    return e.upper == null
                            ? block.termCount() - 1
                            : startAt(block.search(e.validator, e.upper.value), e.upper.inclusive);

                default:
                    throw new IllegalArgumentException("Unknown order: " + this);
            }
        }

        public int startAt(SearchResult<DataTerm> found, boolean inclusive)
        {
            switch (this)
            {
                case DESC:
                    if (found.cmp < 0)
                        return found.index + 1;

                    return inclusive || found.cmp != 0 ? found.index : found.index + 1;

                case ASC:
                    if (found.cmp < 0) // search term was bigger than the whole data set
                        return found.index;
                    return inclusive && (found.cmp == 0 || found.cmp < 0) ? found.index : found.index - 1;

                default:
                    throw new IllegalArgumentException("Unknown order: " + this);
            }
        }
    }
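
    /*
     * A sketch of the on-disk format, as implied by the reads in the constructor
     * below (not a normative description): a header holding the descriptor (UTF),
     * term size (short), min/max term and min/max key (each short-length prefixed),
     * mode (UTF) and hasMarkedPartials (boolean); then the data, super-block and
     * pointer blocks; and a footer with the per-level block offset tables, whose
     * starting position is stored in the last 8 bytes of the file.
     */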
    public final Descriptor descriptor;

    protected final OnDiskIndexBuilder.Mode mode;
    protected final OnDiskIndexBuilder.TermSize termSize;

    protected final AbstractType<?> comparator;
    protected final MappedBuffer indexFile;
    protected final long indexSize;
    protected final boolean hasMarkedPartials;

    protected final Function<Long, DecoratedKey> keyFetcher;

    protected final String indexPath;

    protected final PointerLevel[] levels;
    protected final DataLevel dataLevel;

    protected final ByteBuffer minTerm, maxTerm, minKey, maxKey;

    @SuppressWarnings("resource")
    public OnDiskIndex(File index, AbstractType<?> cmp, Function<Long, DecoratedKey> keyReader)
    {
        keyFetcher = keyReader;

        comparator = cmp;
        indexPath = index.getAbsolutePath();

        RandomAccessFile backingFile = null;
        try
        {
            backingFile = new RandomAccessFile(index, "r");

            descriptor = new Descriptor(backingFile.readUTF());

            termSize = OnDiskIndexBuilder.TermSize.of(backingFile.readShort());

            minTerm = ByteBufferUtil.readWithShortLength(backingFile);
            maxTerm = ByteBufferUtil.readWithShortLength(backingFile);

            minKey = ByteBufferUtil.readWithShortLength(backingFile);
            maxKey = ByteBufferUtil.readWithShortLength(backingFile);

            mode = OnDiskIndexBuilder.Mode.mode(backingFile.readUTF());
            hasMarkedPartials = backingFile.readBoolean();

            indexSize = backingFile.length();
            indexFile = new MappedBuffer(new ChannelProxy(indexPath, backingFile.getChannel()));

            // start of the levels
            indexFile.position(indexFile.getLong(indexSize - 8));

            int numLevels = indexFile.getInt();
            levels = new PointerLevel[numLevels];
            for (int i = 0; i < levels.length; i++)
            {
                int blockCount = indexFile.getInt();
                levels[i] = new PointerLevel(indexFile.position(), blockCount);
                indexFile.position(indexFile.position() + blockCount * 8);
            }

            int blockCount = indexFile.getInt();
            dataLevel = new DataLevel(indexFile.position(), blockCount);
        }
        catch (IOException e)
        {
            throw new FSReadError(e, index);
        }
        finally
        {
            FileUtils.closeQuietly(backingFile);
        }
    }

    public boolean hasMarkedPartials()
    {
        return hasMarkedPartials;
    }

    public OnDiskIndexBuilder.Mode mode()
    {
        return mode;
    }

    public ByteBuffer minTerm()
    {
        return minTerm;
    }

    public ByteBuffer maxTerm()
    {
        return maxTerm;
    }

    public ByteBuffer minKey()
    {
        return minKey;
    }

    public ByteBuffer maxKey()
    {
        return maxKey;
    }

    public DataTerm min()
    {
        return dataLevel.getBlock(0).getTerm(0);
    }

    public DataTerm max()
    {
        DataBlock block = dataLevel.getBlock(dataLevel.blockCount - 1);
        return block.getTerm(block.termCount() - 1);
    }
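
    /*
     * Worked example for search() below (hypothetical text terms): a RANGE query
     * with bounds [a, z] and exclusions {m, d} sorts the exclusions and splits the
     * range into [a, d) + (d, m) + (m, z], then unions the results of the three
     * sub-queries. A NOT_EQ query is first rewritten as the special case with
     * bounds [minTerm, maxTerm] and a single exclusion.
     */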
    /**
     * Search for rows which match all of the terms inside the given expression in the index file.
     *
     * @param exp The expression to use for the query.
     *
     * @return Iterator which contains rows for all of the terms from the given range.
     */
    public RangeIterator<Long, Token> search(Expression exp)
    {
        assert mode.supports(exp.getOp());

        if (exp.getOp() == Expression.Op.PREFIX && mode == OnDiskIndexBuilder.Mode.CONTAINS && !hasMarkedPartials)
            throw new UnsupportedOperationException("prefix queries in CONTAINS mode are not supported by this index");

        // optimization in case a single term is requested from the index:
        // we don't really need to build an additional union iterator
        if (exp.getOp() == Op.EQ)
        {
            DataTerm term = getTerm(exp.lower.value);
            return term == null ? null : term.getTokens();
        }

        // convert a single NOT_EQ to a range with an exclusion
        final Expression expression = (exp.getOp() != Op.NOT_EQ)
                                      ? exp
                                      : new Expression(exp).setOp(Op.RANGE)
                                                           .setLower(new Expression.Bound(minTerm, true))
                                                           .setUpper(new Expression.Bound(maxTerm, true))
                                                           .addExclusion(exp.lower.value);

        List<ByteBuffer> exclusions = new ArrayList<>(expression.exclusions.size());

        Iterables.addAll(exclusions, expression.exclusions.stream().filter(exclusion -> {
            // accept only exclusions which are within the lower/upper bounds
            return !(expression.lower != null && comparator.compare(exclusion, expression.lower.value) < 0)
                && !(expression.upper != null && comparator.compare(exclusion, expression.upper.value) > 0);
        }).collect(Collectors.toList()));

        Collections.sort(exclusions, comparator);

        if (exclusions.size() == 0)
            return searchRange(expression);

        List<Expression> ranges = new ArrayList<>(exclusions.size());

        // calculate range splits based on the sorted exclusions
        Iterator<ByteBuffer> exclusionsIterator = exclusions.iterator();

        Expression.Bound min = expression.lower, max = null;
        while (exclusionsIterator.hasNext())
        {
            max = new Expression.Bound(exclusionsIterator.next(), false);
            ranges.add(new Expression(expression).setOp(Op.RANGE).setLower(min).setUpper(max));
            min = max;
        }

        assert max != null;
        ranges.add(new Expression(expression).setOp(Op.RANGE).setLower(max).setUpper(expression.upper));

        RangeUnionIterator.Builder<Long, Token> builder = RangeUnionIterator.builder();
        for (Expression e : ranges)
        {
            @SuppressWarnings("resource")
            RangeIterator<Long, Token> range = searchRange(e);
            if (range != null)
                builder.add(range);
        }

        return builder.build();
    }
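
    /*
     * Dispatch note for searchRange() below: the multi-block read path (block and
     * super-block iterators) is taken only when the index is in SPARSE mode and the
     * query spans more than two data blocks; every other range is resolved by
     * searchPoint(), which walks terms sequentially starting from the lower block.
     */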
    private RangeIterator<Long, Token> searchRange(Expression range)
    {
        Expression.Bound lower = range.lower;
        Expression.Bound upper = range.upper;

        int lowerBlock = lower == null ? 0 : getDataBlock(lower.value);

        // optimization so we don't have to fetch upperBlock when the query has lower == upper
        int upperBlock = upper == null
                       ? dataLevel.blockCount - 1
                       : (lower != null && comparator.compare(lower.value, upper.value) == 0) ? lowerBlock : getDataBlock(upper.value);

        return (mode != OnDiskIndexBuilder.Mode.SPARSE || lowerBlock == upperBlock || upperBlock - lowerBlock <= 1)
                ? searchPoint(lowerBlock, range)
                : searchRange(lowerBlock, lower, upperBlock, upper);
    }

    private RangeIterator<Long, Token> searchRange(int lowerBlock, Expression.Bound lower, int upperBlock, Expression.Bound upper)
    {
        // if lower is at the beginning of the block, that means we can just do a single iterator per block
        SearchResult<DataTerm> lowerPosition = (lower == null) ? null : searchIndex(lower.value, lowerBlock);
        SearchResult<DataTerm> upperPosition = (upper == null) ? null : searchIndex(upper.value, upperBlock);

        RangeUnionIterator.Builder<Long, Token> builder = RangeUnionIterator.builder();

        // optimistically assume that the first and last blocks are full block reads, saves at least 3 'else' conditions
        int firstFullBlockIdx = lowerBlock, lastFullBlockIdx = upperBlock;

        // 'lower' doesn't cover the whole block, so we need to do a partial iteration.
        // Two reasons why that can happen:
        //   - 'lower' is not the first element of the block
        //   - 'lower' is the first element, but it's not inclusive in the query
        if (lowerPosition != null && (lowerPosition.index > 0 || !lower.inclusive))
        {
            DataBlock block = dataLevel.getBlock(lowerBlock);

            int start = (lower.inclusive || lowerPosition.cmp != 0) ? lowerPosition.index : lowerPosition.index + 1;

            builder.add(block.getRange(start, block.termCount()));
            firstFullBlockIdx = lowerBlock + 1;
        }

        if (upperPosition != null)
        {
            DataBlock block = dataLevel.getBlock(upperBlock);
            int lastIndex = block.termCount() - 1;

            // The same as with 'lower', but here we need to check if the upper is the last element of the block,
            // which means that we only have to get individual results if:
            //   - it *is not* the last element, or
            //   - it *is*, but shouldn't be included (dictated by upper.inclusive)
            if (upperPosition.index != lastIndex || !upper.inclusive)
            {
                int end = (upperPosition.cmp < 0 || (upperPosition.cmp == 0 && upper.inclusive))
                        ? upperPosition.index + 1
                        : upperPosition.index;

                builder.add(block.getRange(0, end));
                lastFullBlockIdx = upperBlock - 1;
            }
        }

        int totalSuperBlocks = (lastFullBlockIdx - firstFullBlockIdx) / OnDiskIndexBuilder.SUPER_BLOCK_SIZE;

        // if there are no super-blocks, we can simply read all of the block iterators in sequence
        if (totalSuperBlocks == 0)
        {
            for (int i = firstFullBlockIdx; i <= lastFullBlockIdx; i++)
                builder.add(dataLevel.getBlock(i).getBlockIndex().iterator(keyFetcher));

            return builder.build();
        }

        // first get all of the blocks which are aligned before the first super-block in the sequence,
        // e.g. if the block range was (1, 9) and super-block-size = 4, we need to read blocks 1, 2 and 3
        // individually; 4 - 7 are covered by the super-block, and 8, 9 are the remainder.

        int superBlockAlignedStart = firstFullBlockIdx == 0 ? 0 : (int) FBUtilities.align(firstFullBlockIdx, OnDiskIndexBuilder.SUPER_BLOCK_SIZE);
        for (int blockIdx = firstFullBlockIdx; blockIdx < Math.min(superBlockAlignedStart, lastFullBlockIdx); blockIdx++)
            builder.add(getBlockIterator(blockIdx));

        // now read all of the super-blocks matched by the request; from the previous example
        // it's the super-block with index 1 (which covers blocks 4 to 7)

        int superBlockIdx = superBlockAlignedStart / OnDiskIndexBuilder.SUPER_BLOCK_SIZE;
        for (int offset = 0; offset < totalSuperBlocks - 1; offset++)
            builder.add(dataLevel.getSuperBlock(superBlockIdx++).iterator());

        // now it's time for the remainder read; again from the previous example it's blocks 8 and 9,
        // because we have overshot the previous block but didn't request enough to cover the next super-block.

        int lastCoveredBlock = superBlockIdx * OnDiskIndexBuilder.SUPER_BLOCK_SIZE;
        for (int offset = 0; offset <= (lastFullBlockIdx - lastCoveredBlock); offset++)
            builder.add(getBlockIterator(lastCoveredBlock + offset));

        return builder.build();
    }
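
    /*
     * Tracing the (1, 9) example from the comments above, with SUPER_BLOCK_SIZE = 4:
     *   superBlockAlignedStart = align(1, 4) = 4, so blocks 1..3 are read individually;
     *   totalSuperBlocks = (9 - 1) / 4 = 2, superBlockIdx starts at 4 / 4 = 1, and the
     *   single super-block read (offset < totalSuperBlocks - 1) covers blocks 4..7;
     *   lastCoveredBlock = 2 * 4 = 8, so blocks 8 and 9 are read as the remainder.
     */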
    private RangeIterator<Long, Token> searchPoint(int lowerBlock, Expression expression)
    {
        Iterator<DataTerm> terms = new TermIterator(lowerBlock, expression, IteratorOrder.DESC);
        RangeUnionIterator.Builder<Long, Token> builder = RangeUnionIterator.builder();

        while (terms.hasNext())
        {
            try
            {
                builder.add(terms.next().getTokens());
            }
            finally
            {
                expression.checkpoint();
            }
        }

        return builder.build();
    }

    private RangeIterator<Long, Token> getBlockIterator(int blockIdx)
    {
        DataBlock block = dataLevel.getBlock(blockIdx);
        return (block.hasCombinedIndex)
                ? block.getBlockIndex().iterator(keyFetcher)
                : block.getRange(0, block.termCount());
    }

    public Iterator<DataTerm> iteratorAt(ByteBuffer query, IteratorOrder order, boolean inclusive)
    {
        Expression e = new Expression("", comparator);
        Expression.Bound bound = new Expression.Bound(query, inclusive);

        switch (order)
        {
            case DESC:
                e.setLower(bound);
                break;

            case ASC:
                e.setUpper(bound);
                break;

            default:
                throw new IllegalArgumentException("Unknown order: " + order);
        }

        return new TermIterator(levels.length == 0 ? 0 : getBlockIdx(findPointer(query), query), e, order);
    }

    private int getDataBlock(ByteBuffer query)
    {
        return levels.length == 0 ? 0 : getBlockIdx(findPointer(query), query);
    }

    public Iterator<DataTerm> iterator()
    {
        return new TermIterator(0, new Expression("", comparator), IteratorOrder.DESC);
    }

    public void close() throws IOException
    {
        FileUtils.closeQuietly(indexFile);
    }

    private PointerTerm findPointer(ByteBuffer query)
    {
        PointerTerm ptr = null;
        for (PointerLevel level : levels)
        {
            if ((ptr = level.getPointer(ptr, query)) == null)
                return null;
        }

        return ptr;
    }

    private DataTerm getTerm(ByteBuffer query)
    {
        SearchResult<DataTerm> term = searchIndex(query, getDataBlock(query));
        return term.cmp == 0 ? term.result : null;
    }

    private SearchResult<DataTerm> searchIndex(ByteBuffer query, int blockIdx)
    {
        return dataLevel.getBlock(blockIdx).search(comparator, query);
    }

    private int getBlockIdx(PointerTerm ptr, ByteBuffer query)
    {
        int blockIdx = 0;
        if (ptr != null)
        {
            int cmp = ptr.compareTo(comparator, query);
            blockIdx = (cmp == 0 || cmp > 0) ? ptr.getBlock() : ptr.getBlock() + 1;
        }

        return blockIdx;
    }

    protected class PointerLevel extends Level<PointerBlock>
    {
        public PointerLevel(long offset, int count)
        {
            super(offset, count);
        }

        public PointerTerm getPointer(PointerTerm parent, ByteBuffer query)
        {
            return getBlock(getBlockIdx(parent, query)).search(comparator, query).result;
        }

        protected PointerBlock cast(MappedBuffer block)
        {
            return new PointerBlock(block);
        }
    }

    protected class DataLevel extends Level<DataBlock>
    {
        protected final int superBlockCnt;
        protected final long superBlocksOffset;

        public DataLevel(long offset, int count)
        {
            super(offset, count);
            long baseOffset = blockOffsets + blockCount * 8;
            superBlockCnt = indexFile.getInt(baseOffset);
            superBlocksOffset = baseOffset + 4;
        }

        protected DataBlock cast(MappedBuffer block)
        {
            return new DataBlock(block);
        }

        public OnDiskSuperBlock getSuperBlock(int idx)
        {
            assert idx < superBlockCnt : String.format("requested index %d is greater than super block count %d", idx, superBlockCnt);
            long blockOffset = indexFile.getLong(superBlocksOffset + idx * 8);
            return new OnDiskSuperBlock(indexFile.duplicate().position(blockOffset));
        }
    }

    protected class OnDiskSuperBlock
    {
        private final TokenTree tokenTree;

        public OnDiskSuperBlock(MappedBuffer buffer)
        {
            tokenTree = new TokenTree(descriptor, buffer);
        }

        public RangeIterator<Long, Token> iterator()
        {
            return tokenTree.iterator(keyFetcher);
        }
    }

    protected abstract class Level<T extends OnDiskBlock>
    {
        protected final long blockOffsets;
        protected final int blockCount;

        public Level(long offsets, int count)
        {
            this.blockOffsets = offsets;
            this.blockCount = count;
        }

        public T getBlock(int idx) throws FSReadError
        {
            assert idx >= 0 && idx < blockCount;

            // calculate block offset and move there
            // (long is intentional, we'll just need an mmap implementation which supports long positions)
            long blockOffset = indexFile.getLong(blockOffsets + idx * 8);
            return cast(indexFile.duplicate().position(blockOffset));
        }

        protected abstract T cast(MappedBuffer block);
    }
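
    /*
     * Note for DataBlock.getRange() below: non-sparse terms contribute whole
     * TokenTree iterators to the union, while sparse terms are accumulated into a
     * single sorted map (merging duplicate tokens across terms) and served through
     * one PrefetchedTokensIterator instead of one iterator per term.
     */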
    protected class DataBlock extends OnDiskBlock<DataTerm>
    {
        public DataBlock(MappedBuffer data)
        {
            super(descriptor, data, BlockType.DATA);
        }

        protected DataTerm cast(MappedBuffer data)
        {
            return new DataTerm(data, termSize, getBlockIndex());
        }

        public RangeIterator<Long, Token> getRange(int start, int end)
        {
            RangeUnionIterator.Builder<Long, Token> builder = RangeUnionIterator.builder();
            NavigableMap<Long, Token> sparse = new TreeMap<>();

            for (int i = start; i < end; i++)
            {
                DataTerm term = getTerm(i);

                if (term.isSparse())
                {
                    NavigableMap<Long, Token> tokens = term.getSparseTokens();
                    for (Map.Entry<Long, Token> t : tokens.entrySet())
                    {
                        Token token = sparse.get(t.getKey());
                        if (token == null)
                            sparse.put(t.getKey(), t.getValue());
                        else
                            token.merge(t.getValue());
                    }
                }
                else
                {
                    builder.add(term.getTokens());
                }
            }

            PrefetchedTokensIterator prefetched = sparse.isEmpty() ? null : new PrefetchedTokensIterator(sparse);

            if (builder.rangeCount() == 0)
                return prefetched;

            builder.add(prefetched);
            return builder.build();
        }
    }

    protected class PointerBlock extends OnDiskBlock<PointerTerm>
    {
        public PointerBlock(MappedBuffer block)
        {
            super(descriptor, block, BlockType.POINTER);
        }

        protected PointerTerm cast(MappedBuffer data)
        {
            return new PointerTerm(data, termSize, hasMarkedPartials);
        }
    }

    public class DataTerm extends Term implements Comparable<DataTerm>
    {
        private final TokenTree perBlockIndex;

        protected DataTerm(MappedBuffer content, OnDiskIndexBuilder.TermSize size, TokenTree perBlockIndex)
        {
            super(content, size, hasMarkedPartials);
            this.perBlockIndex = perBlockIndex;
        }

        public RangeIterator<Long, Token> getTokens()
        {
            final long blockEnd = FBUtilities.align(content.position(), OnDiskIndexBuilder.BLOCK_SIZE);

            if (isSparse())
                return new PrefetchedTokensIterator(getSparseTokens());

            long offset = blockEnd + 4 + content.getInt(getDataOffset() + 1);
            return new TokenTree(descriptor, indexFile.duplicate().position(offset)).iterator(keyFetcher);
        }

        public boolean isSparse()
        {
            return content.get(getDataOffset()) > 0;
        }

        public NavigableMap<Long, Token> getSparseTokens()
        {
            long ptrOffset = getDataOffset();

            byte size = content.get(ptrOffset);

            assert size > 0;

            NavigableMap<Long, Token> individualTokens = new TreeMap<>();
            for (int i = 0; i < size; i++)
            {
                Token token = perBlockIndex.get(content.getLong(ptrOffset + 1 + (8 * i)), keyFetcher);

                assert token != null;
                individualTokens.put(token.get(), token);
            }

            return individualTokens;
        }

        public int compareTo(DataTerm other)
        {
            return other == null ? 1 : compareTo(comparator, other.getTerm());
        }
    }

    protected static class PointerTerm extends Term
    {
        public PointerTerm(MappedBuffer content, OnDiskIndexBuilder.TermSize size, boolean hasMarkedPartials)
        {
            super(content, size, hasMarkedPartials);
        }

        public int getBlock()
        {
            return content.getInt(getDataOffset());
        }
    }

    private static class PrefetchedTokensIterator extends RangeIterator<Long, Token>
    {
        private final NavigableMap<Long, Token> tokens;
        private PeekingIterator<Token> currentIterator;

        public PrefetchedTokensIterator(NavigableMap<Long, Token> tokens)
        {
            super(tokens.firstKey(), tokens.lastKey(), tokens.size());
            this.tokens = tokens;
            this.currentIterator = Iterators.peekingIterator(tokens.values().iterator());
        }

        protected Token computeNext()
        {
            return currentIterator != null && currentIterator.hasNext()
                    ? currentIterator.next()
                    : endOfData();
        }

        protected void performSkipTo(Long nextToken)
        {
            currentIterator = Iterators.peekingIterator(tokens.tailMap(nextToken, true).values().iterator());
        }

        public void close() throws IOException
        {
            endOfData();
        }
    }
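
    /*
     * A sketch of the per-term token encoding as read by DataTerm above (derived
     * from the reads, not from a spec): the byte at getDataOffset() is a count;
     * when positive, the term is "sparse" and is followed by that many 8-byte
     * entries, each resolved against the per-block TokenTree; otherwise the int at
     * getDataOffset() + 1 points (relative to the end of the current block, plus 4
     * bytes) to a TokenTree holding all of the term's tokens.
     */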
    public AbstractType<?> getComparator()
    {
        return comparator;
    }

    public String getIndexPath()
    {
        return indexPath;
    }

    private class TermIterator extends AbstractIterator<DataTerm>
    {
        private final Expression e;
        private final IteratorOrder order;

        protected OnDiskBlock<DataTerm> currentBlock;
        protected int blockIndex, offset;

        private boolean checkLower = true, checkUpper = true;

        public TermIterator(int startBlock, Expression expression, IteratorOrder order)
        {
            this.e = expression;
            this.order = order;
            this.blockIndex = startBlock;

            nextBlock();
        }

        protected DataTerm computeNext()
        {
            for (;;)
            {
                if (currentBlock == null)
                    return endOfData();

                if (offset >= 0 && offset < currentBlock.termCount())
                {
                    DataTerm currentTerm = currentBlock.getTerm(nextOffset());

                    // in PREFIX mode, we need to step over all of the partial terms
                    // encountered by the query until the upper bound tells us to stop
                    if (e.getOp() == Op.PREFIX && currentTerm.isPartial())
                        continue;

                    // haven't reached the start of the query range yet, so let's
                    // keep skipping the current term until the lower bound is satisfied
                    if (checkLower && !e.isLowerSatisfiedBy(currentTerm))
                        continue;

                    // flip the flag right on the first bounds match
                    // to avoid expensive comparisons
                    checkLower = false;

                    if (checkUpper && !e.isUpperSatisfiedBy(currentTerm))
                        return endOfData();

                    return currentTerm;
                }

                nextBlock();
            }
        }

        protected void nextBlock()
        {
            currentBlock = null;

            if (blockIndex < 0 || blockIndex >= dataLevel.blockCount)
                return;

            currentBlock = dataLevel.getBlock(nextBlockIndex());
            offset = checkLower ? order.startAt(currentBlock, e) : currentBlock.minOffset(order);

            // let's check the last term of the new block right away;
            // if the expression's upper bound is satisfied by it, that means we can avoid
            // doing any expensive upper bound checks for that block.
            checkUpper = e.hasUpper() && !e.isUpperSatisfiedBy(currentBlock.getTerm(currentBlock.maxOffset(order)));
        }

        protected int nextBlockIndex()
        {
            int current = blockIndex;
            blockIndex += order.step;
            return current;
        }

        protected int nextOffset()
        {
            int current = offset;
            offset += order.step;
            return current;
        }
    }
}