/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.cassandra.db;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Iterator;

import com.google.common.collect.AbstractIterator;

import org.apache.cassandra.io.IVersionedSerializer;
import org.apache.cassandra.net.MessagingService;
import org.apache.cassandra.utils.ByteBufferUtil;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Data structure holding the range tombstones of a ColumnFamily.
 * <p>
 * This is essentially a sorted list of non-overlapping (tombstone) ranges.
 * <p>
 * A range tombstone has 4 elements: the start and the end of the range covered,
 * and the deletion info (markedAt timestamp and local deletion time). The
 * markedAt timestamp is what defines the priority of 2 overlapping tombstones.
 * That is, given 2 tombstones [0, 10]@t1 and [5, 15]@t2 (t1 and t2 being the
 * tombstones' markedAt values), if t2 > t1 then the 2nd tombstone takes
 * precedence over the first one on [5, 10]. If such tombstones are added to a
 * RangeTombstoneList, the list will store them as [[0, 5]@t1, [5, 15]@t2].
 * <p>
 * The only use of the local deletion time is to know when a given tombstone can
 * be purged, which is done by the purge() method.
 */
public class RangeTombstoneList implements Iterable<RangeTombstone>
{
    private static final Logger logger = LoggerFactory.getLogger(RangeTombstoneList.class);

    public static final Serializer serializer = new Serializer();

    private final Comparator<ByteBuffer> comparator;

    // Note: we don't want to use a List for the markedAts and delTimes to avoid boxing. We could
    // use a List for starts and ends, but it is simpler to just use arrays everywhere.
    private ByteBuffer[] starts;
    private ByteBuffer[] ends;
    private long[] markedAts;
    private int[] delTimes;

    private int size;

    private RangeTombstoneList(Comparator<ByteBuffer> comparator, ByteBuffer[] starts, ByteBuffer[] ends, long[] markedAts, int[] delTimes, int size)
    {
        assert starts.length == ends.length && starts.length == markedAts.length && starts.length == delTimes.length;
        this.comparator = comparator;
        this.starts = starts;
        this.ends = ends;
        this.markedAts = markedAts;
        this.delTimes = delTimes;
        this.size = size;
    }

    public RangeTombstoneList(Comparator<ByteBuffer> comparator, int capacity)
    {
        this(comparator, new ByteBuffer[capacity], new ByteBuffer[capacity], new long[capacity], new int[capacity], 0);
    }

    public boolean isEmpty()
    {
        return size == 0;
    }

    public int size()
    {
        return size;
    }

    public Comparator<ByteBuffer> comparator()
    {
        return comparator;
    }

    public RangeTombstoneList copy()
    {
        return new RangeTombstoneList(comparator,
                                      Arrays.copyOf(starts, size),
                                      Arrays.copyOf(ends, size),
                                      Arrays.copyOf(markedAts, size),
                                      Arrays.copyOf(delTimes, size),
                                      size);
    }

    public void add(RangeTombstone tombstone)
    {
        add(tombstone.min, tombstone.max, tombstone.data.markedForDeleteAt, tombstone.data.localDeletionTime);
    }

    /**
     * Adds a new range tombstone.
     *
     * This method will be faster if the new tombstone sorts after all the currently existing
     * ones (a common use case), but it doesn't assume so.
     */
    public void add(ByteBuffer start, ByteBuffer end, long markedAt, int delTime)
    {
        if (isEmpty())
        {
            addInternal(0, start, end, markedAt, delTime);
            return;
        }

        int c = comparator.compare(starts[size-1], start);

        // Fast path if we add in sorted order
        if (c <= 0)
        {
            // Note that we may still overlap the last range
            insertFrom(size-1, start, end, markedAt, delTime);
        }
        else
        {
            int pos = Arrays.binarySearch(starts, 0, size, start, comparator);
            if (pos >= 0)
                insertFrom(pos, start, end, markedAt, delTime);
            else
                // The insertion point is -pos-1 and start < starts[-pos-1], so we should insert from the previous element
                insertFrom(-pos-2, start, end, markedAt, delTime);
        }
    }

    /**
     * Adds all the range tombstones of {@code tombstones} to this RangeTombstoneList.
     */
    public void addAll(RangeTombstoneList tombstones)
    {
        if (tombstones.isEmpty())
            return;

        if (isEmpty())
        {
            copyArrays(tombstones, this);
            return;
        }

        /*
         * We basically have 2 techniques we can use here: either we repeatedly call add() on the values of
         * tombstones, or we do a merge of both (sorted) lists. If this list is much bigger than the one we
         * add, then calling add() will be faster; otherwise merging will be faster.
         *
         * Note that during memtable updates, it is not uncommon for a new update to have only a few range
         * tombstones while the CF we're adding it to (the one in the memtable) has many. In that case,
         * using add() is likely going to be faster.
         *
         * In other cases however, like when diffing responses from multiple nodes, the tombstone lists we
         * "merge" will likely be of similar size, so using add() might be a bit inefficient.
         *
         * Roughly speaking (this ignores the fact that updating an element is not exactly constant, but
         * that's not a big deal), if n is the size of this list and m is the size of tombstones, merging
         * is O(n+m) while using add() is O(m*log(n)).
         *
         * But let's not compute a logarithm just for that. Long story short, merging is only a bad choice
         * if this list is a lot bigger than the other one, so let's keep it simple.
         */
        if (size > 10 * tombstones.size)
        {
            for (int i = 0; i < tombstones.size; i++)
                add(tombstones.starts[i], tombstones.ends[i], tombstones.markedAts[i], tombstones.delTimes[i]);
        }
        else
        {
            int i = 0;
            int j = 0;
            while (i < size && j < tombstones.size)
            {
                if (comparator.compare(tombstones.starts[j], starts[i]) < 0)
                {
                    insertFrom(i-1, tombstones.starts[j], tombstones.ends[j], tombstones.markedAts[j], tombstones.delTimes[j]);
                    j++;
                }
                else
                {
                    i++;
                }
            }
            // Adds the remaining ones from tombstones, if any (note that insertFrom will increment size when relevant).
            for (; j < tombstones.size; j++)
                insertFrom(size - 1, tombstones.starts[j], tombstones.ends[j], tombstones.markedAts[j], tombstones.delTimes[j]);
        }
    }

    /**
     * Returns whether the given name/timestamp pair is deleted by one of the tombstones
     * of this RangeTombstoneList.
     */
    public boolean isDeleted(ByteBuffer name, long timestamp)
    {
        int idx = searchInternal(name);
        return idx >= 0 && markedAts[idx] >= timestamp;
    }

    /**
     * Returns a new {@link InOrderTester}.
     */
    InOrderTester inOrderTester()
    {
        return new InOrderTester();
    }

    /**
     * Returns the DeletionTime for the tombstone overlapping {@code name} (there can't be more than one),
     * or null if {@code name} is not covered by any tombstone.
     */
    public DeletionTime search(ByteBuffer name)
    {
        int idx = searchInternal(name);
        return idx < 0 ? null : new DeletionTime(markedAts[idx], delTimes[idx]);
    }

    private int searchInternal(ByteBuffer name)
    {
        if (isEmpty())
            return -1;

        int pos = Arrays.binarySearch(starts, 0, size, name, comparator);
        if (pos >= 0)
        {
            // We're exactly on an interval start. The one subtlety is that we need to check whether the
            // previous range ends on the same name with a higher markedAt, in which case it takes precedence.
            if (pos > 0 && comparator.compare(name, ends[pos-1]) == 0 && markedAts[pos-1] > markedAts[pos])
                return pos-1;
            else
                return pos;
        }
        else
        {
            // We potentially intersect the range before our "insertion point"
            int idx = -pos-2;
            if (idx < 0)
                return -1;

            return comparator.compare(name, ends[idx]) <= 0 ? idx : -1;
        }
    }

    public int dataSize()
    {
        int dataSize = TypeSizes.NATIVE.sizeof(size);
        for (int i = 0; i < size; i++)
        {
            dataSize += starts[i].remaining() + ends[i].remaining();
            dataSize += TypeSizes.NATIVE.sizeof(markedAts[i]);
            dataSize += TypeSizes.NATIVE.sizeof(delTimes[i]);
        }
        return dataSize;
    }

    public long minMarkedAt()
    {
        long min = Long.MAX_VALUE;
        for (int i = 0; i < size; i++)
            min = Math.min(min, markedAts[i]);
        return min;
    }

    public long maxMarkedAt()
    {
        long max = Long.MIN_VALUE;
        for (int i = 0; i < size; i++)
            max = Math.max(max, markedAts[i]);
        return max;
    }

    public void updateAllTimestamp(long timestamp)
    {
        for (int i = 0; i < size; i++)
            markedAts[i] = timestamp;
    }

    /**
     * Removes all range tombstones whose local deletion time is older than gcBefore.
     */
    public void purge(int gcBefore)
    {
        int j = 0;
        for (int i = 0; i < size; i++)
        {
            if (delTimes[i] >= gcBefore)
                setInternal(j++, starts[i], ends[i], markedAts[i], delTimes[i]);
        }
        size = j;
    }

    /**
     * Returns whether {@code purge(gcBefore)} would remove something or not.
     */
    public boolean hasIrrelevantData(int gcBefore)
    {
        for (int i = 0; i < size; i++)
        {
            if (delTimes[i] < gcBefore)
                return true;
        }
        return false;
    }

    public Iterator<RangeTombstone> iterator()
    {
        return new AbstractIterator<RangeTombstone>()
        {
            private int idx;

            protected RangeTombstone computeNext()
            {
                if (idx >= size)
                    return endOfData();

                RangeTombstone t = new RangeTombstone(starts[idx], ends[idx], markedAts[idx], delTimes[idx]);
                idx++;
                return t;
            }
        };
    }

    @Override
    public boolean equals(Object o)
    {
        if (!(o instanceof RangeTombstoneList))
            return false;
        RangeTombstoneList that = (RangeTombstoneList)o;
        if (size != that.size)
            return false;

        for (int i = 0; i < size; i++)
        {
            if (!starts[i].equals(that.starts[i]))
                return false;
            if (!ends[i].equals(that.ends[i]))
                return false;
            if (markedAts[i] != that.markedAts[i])
                return false;
            if (delTimes[i] != that.delTimes[i])
                return false;
        }
        return true;
    }

    @Override
    public final int hashCode()
    {
        int result = size;
        for (int i = 0; i < size; i++)
        {
            result += starts[i].hashCode() + ends[i].hashCode();
            result += (int)(markedAts[i] ^ (markedAts[i] >>> 32));
            result += delTimes[i];
        }
        return result;
    }

    private static void copyArrays(RangeTombstoneList src, RangeTombstoneList dst)
    {
        dst.grow(src.size);
        System.arraycopy(src.starts, 0, dst.starts, 0, src.size);
        System.arraycopy(src.ends, 0, dst.ends, 0, src.size);
        System.arraycopy(src.markedAts, 0, dst.markedAts, 0, src.size);
        System.arraycopy(src.delTimes, 0, dst.delTimes, 0, src.size);
        dst.size = src.size;
    }

    /*
     * Inserts a new element whose start should be inserted at index i. This method
     * assumes that:
     *   - starts[i] <= start
     *   - start < starts[i+1], or there is no i+1 element.
     */
    private void insertFrom(int i, ByteBuffer start, ByteBuffer end, long markedAt, int delTime)
    {
        if (i < 0)
        {
            insertAfter(i, start, end, markedAt, delTime);
            return;
        }

        /*
         * We have elt(i) = [s_i, e_i]@t_i and want to insert X = [s, e]@t, knowing that s_i < s < s_i+1.
         * We can have 3 cases:
         *   - s < e_i && e <= e_i: X is fully contained in i.
         *   - s < e_i && e > e_i: we rewrite X to X1=[s, e_i]@t + X2=[e_i, e]@t. X1 is fully contained
         *     in i and X2 is the insertAfter() case for i.
         *   - s >= e_i: we're in the insertAfter() case for i.
         */
        if (comparator.compare(start, ends[i]) < 0)
        {
            if (comparator.compare(end, ends[i]) <= 0)
            {
                update(i, start, end, markedAt, delTime);
            }
            else
            {
                insertAfter(i, ends[i], end, markedAt, delTime);
                update(i, start, ends[i], markedAt, delTime);
            }
        }
        else
        {
            insertAfter(i, start, end, markedAt, delTime);
        }
    }

    /*
     * Inserts a new element knowing that the new element starts strictly after
     * the one at index i, i.e. that:
     *   - ends[i] <= start (or i == -1)
     */
    private void insertAfter(int i, ByteBuffer start, ByteBuffer end, long markedAt, int delTime)
    {
        if (i == size - 1)
        {
            addInternal(i+1, start, end, markedAt, delTime);
            return;
        }

        /*
         * We have the following intervals:
         *        i            i+1
         *   ..., [s1, e1]@t1, [s2, e2]@t2, ...
         *
         * and we want to insert X = [s, e]@t, knowing that e1 <= s.
         * We can have 2 cases:
         *   - s < s2: we rewrite X to X1=[s, s2]@t + X2=[s2, e]@t. X2 meets the weakInsertFrom() condition
         *     for i+1, and X1 is a new element between i and i+1.
         *   - s2 <= s: we're in the weakInsertFrom() case for i+1.
         */
        if (comparator.compare(start, starts[i+1]) < 0)
        {
            /*
             * If it happens that the new element ends before the element at i+1 starts, we just
             * insert it and we're done.
             */
            if (comparator.compare(end, starts[i+1]) <= 0)
            {
                addInternal(i+1, start, end, markedAt, delTime);
                return;
            }

            weakInsertFrom(i+1, starts[i+1], end, markedAt, delTime);
            addInternal(i+1, start, starts[i+1], markedAt, delTime);
        }
        else
        {
            weakInsertFrom(i+1, start, end, markedAt, delTime);
        }
    }

    /*
     * Weak version of insertFrom that only assumes the new element starts after index i,
     * but without knowing about the 2nd condition, i.e. it only assumes that:
     *   - starts[i] <= start
     */
    private void weakInsertFrom(int i, ByteBuffer start, ByteBuffer end, long markedAt, int delTime)
    {
        /*
         * Either start is before the next element's start, and we're in fact in the insertFrom()
         * case, or it's not and this is a weakInsertFrom for the next index.
         */
        if (i == size - 1 || comparator.compare(start, starts[i+1]) < 0)
            insertFrom(i, start, end, markedAt, delTime);
        else
            weakInsertFrom(i+1, start, end, markedAt, delTime);
    }

    /*
     * Updates index i with the new element, assuming that the new element is contained in element i,
     * i.e. that:
     *   - starts[i] <= start
     *   - end <= ends[i]
     */
    private void update(int i, ByteBuffer start, ByteBuffer end, long markedAt, int delTime)
    {
        /*
         * If the new markedAt is lower than that of i, we can ignore the
         * new element; otherwise we split the current element.
         */
        if (markedAts[i] < markedAt)
        {
            if (comparator.compare(ends[i], end) != 0)
                addInternal(i+1, end, ends[i], markedAts[i], delTimes[i]);

            if (comparator.compare(starts[i], start) == 0)
            {
                markedAts[i] = markedAt;
                delTimes[i] = delTime;
                ends[i] = end;
            }
            else
            {
                addInternal(i+1, start, end, markedAt, delTime);
                ends[i] = start;
            }
        }
    }

    private int capacity()
    {
        return starts.length;
    }

    /*
     * Adds the new tombstone at index i, growing and/or moving elements to make room for it.
     */
    private void addInternal(int i, ByteBuffer start, ByteBuffer end, long markedAt, int delTime)
    {
        assert i >= 0;

        if (size == capacity())
            growToFree(i);
        else if (i < size)
            moveElements(i);

        setInternal(i, start, end, markedAt, delTime);
        size++;
    }

    /*
     * Grows the arrays, leaving index i "free" in the process.
     */
    private void growToFree(int i)
    {
        int newLength = (capacity() * 3) / 2 + 1;
        grow(i, newLength);
    }

    /*
     * Grows the arrays to match newLength capacity.
     */
    private void grow(int newLength)
    {
        if (capacity() < newLength)
            grow(-1, newLength);
    }

    private void grow(int i, int newLength)
    {
        starts = grow(starts, size, newLength, i);
        ends = grow(ends, size, newLength, i);
        markedAts = grow(markedAts, size, newLength, i);
        delTimes = grow(delTimes, size, newLength, i);
    }

    private static ByteBuffer[] grow(ByteBuffer[] a, int size, int newLength, int i)
    {
        if (i < 0 || i >= size)
            return Arrays.copyOf(a, newLength);

        ByteBuffer[] newA = new ByteBuffer[newLength];
        System.arraycopy(a, 0, newA, 0, i);
        System.arraycopy(a, i, newA, i+1, size - i);
        return newA;
    }

    private static long[] grow(long[] a, int size, int newLength, int i)
    {
        if (i < 0 || i >= size)
            return Arrays.copyOf(a, newLength);

        long[] newA = new long[newLength];
        System.arraycopy(a, 0, newA, 0, i);
        System.arraycopy(a, i, newA, i+1, size - i);
        return newA;
    }

    private static int[] grow(int[] a, int size, int newLength, int i)
    {
        if (i < 0 || i >= size)
            return Arrays.copyOf(a, newLength);

        int[] newA = new int[newLength];
        System.arraycopy(a, 0, newA, 0, i);
        System.arraycopy(a, i, newA, i+1, size - i);
        return newA;
    }

    /*
     * Moves elements so that index i is "free", assuming the arrays have at least one free slot at the end.
     */
    private void moveElements(int i)
    {
        if (i >= size)
            return;

        System.arraycopy(starts, i, starts, i+1, size - i);
        System.arraycopy(ends, i, ends, i+1, size - i);
        System.arraycopy(markedAts, i, markedAts, i+1, size - i);
        System.arraycopy(delTimes, i, delTimes, i+1, size - i);
    }

    private void setInternal(int i, ByteBuffer start, ByteBuffer end, long markedAt, int delTime)
    {
        starts[i] = start;
        ends[i] = end;
        markedAts[i] = markedAt;
        delTimes[i] = delTime;
    }

    public static class Serializer implements IVersionedSerializer<RangeTombstoneList>
    {
        private Serializer() {}

        public void serialize(RangeTombstoneList tombstones, DataOutput out, int version) throws IOException
        {
            if (tombstones == null)
            {
                out.writeInt(0);
                return;
            }

            out.writeInt(tombstones.size);
            for (int i = 0; i < tombstones.size; i++)
            {
                ByteBufferUtil.writeWithShortLength(tombstones.starts[i], out);
                ByteBufferUtil.writeWithShortLength(tombstones.ends[i], out);
                out.writeInt(tombstones.delTimes[i]);
                out.writeLong(tombstones.markedAts[i]);
            }
        }

        /*
         * RangeTombstoneList depends on the column family comparator, but it is not serialized.
         * Thus deserialize(DataInput, int, Comparator<ByteBuffer>) should be used instead of this method.
         */
        public RangeTombstoneList deserialize(DataInput in, int version) throws IOException
        {
            throw new UnsupportedOperationException();
        }

        public RangeTombstoneList deserialize(DataInput in, int version, Comparator<ByteBuffer> comparator) throws IOException
        {
            int size = in.readInt();
            if (size == 0)
                return null;

            RangeTombstoneList tombstones = new RangeTombstoneList(comparator, size);

            for (int i = 0; i < size; i++)
            {
                ByteBuffer start = ByteBufferUtil.readWithShortLength(in);
                ByteBuffer end = ByteBufferUtil.readWithShortLength(in);
                int delTime = in.readInt();
                long markedAt = in.readLong();

                if (version >= MessagingService.VERSION_20)
                {
                    tombstones.setInternal(i, start, end, markedAt, delTime);
                }
                else
                {
                    /*
                     * The old implementation used to have ranges sorted by left value, but with
                     * potentially overlapping ranges. So we need to use the "slow" path.
                     */
                    tombstones.add(start, end, markedAt, delTime);
                }
            }

            // The "slow" path takes care of updating the size, but the fast one does not
            if (version >= MessagingService.VERSION_20)
                tombstones.size = size;

            return tombstones;
        }

        public long serializedSize(RangeTombstoneList tombstones, TypeSizes typeSizes, int version)
        {
            if (tombstones == null)
                return typeSizes.sizeof(0);

            long size = typeSizes.sizeof(tombstones.size);
            for (int i = 0; i < tombstones.size; i++)
            {
                int startSize = tombstones.starts[i].remaining();
                size += typeSizes.sizeof((short)startSize) + startSize;
                int endSize = tombstones.ends[i].remaining();
                size += typeSizes.sizeof((short)endSize) + endSize;
                size += typeSizes.sizeof(tombstones.delTimes[i]);
                size += typeSizes.sizeof(tombstones.markedAts[i]);
            }
            return size;
        }

        public long serializedSize(RangeTombstoneList tombstones, int version)
        {
            return serializedSize(tombstones, TypeSizes.NATIVE, version);
        }
    }

    /**
     * This object allows testing whether a given column (name/timestamp) is deleted
     * or not by this RangeTombstoneList, assuming that the columns given to this
     * object are passed in (comparator) sorted order.
     *
     * This is more efficient than calling RangeTombstoneList.isDeleted() repeatedly
     * in that case since we're able to take the sorted nature of the RangeTombstoneList
     * into account.
     */
    public class InOrderTester
    {
        private int idx;

        public boolean isDeleted(ByteBuffer name, long timestamp)
        {
            while (idx < size)
            {
                int cmp = comparator.compare(name, starts[idx]);
                if (cmp == 0)
                {
                    // As in searchInternal, we need to check the previous range's end
                    if (idx > 0 && comparator.compare(name, ends[idx-1]) == 0 && markedAts[idx-1] > markedAts[idx])
                        return markedAts[idx-1] >= timestamp;
                    else
                        return markedAts[idx] >= timestamp;
                }
                else if (cmp < 0)
                {
                    return false;
                }
                else
                {
                    if (comparator.compare(name, ends[idx]) <= 0)
                        return markedAts[idx] >= timestamp;
                    else
                        idx++;
                }
            }
            return false;
        }
    }
}
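
/*
 * Illustrative usage sketch (not part of the original class). It walks through the example from the
 * class javadoc above, [0, 10]@t1 followed by [5, 15]@t2, using the public add(), search() and
 * isDeleted() methods plus an InOrderTester. The example class name, the main() entry point and the
 * unsigned-bytes comparator are assumptions made purely for illustration; in Cassandra itself the
 * comparator would be the column family's comparator.
 */
class RangeTombstoneListUsageSketch
{
    public static void main(String[] args)
    {
        // Assumed comparator for the sketch: unsigned lexicographic order over the raw bytes, which
        // sorts the 4-byte big-endian buffers produced by ByteBufferUtil.bytes(int) numerically.
        Comparator<ByteBuffer> cmp = new Comparator<ByteBuffer>()
        {
            public int compare(ByteBuffer a, ByteBuffer b)
            {
                return ByteBufferUtil.compareUnsigned(a, b);
            }
        };

        RangeTombstoneList list = new RangeTombstoneList(cmp, 4);

        // Add [0, 10]@1 and then the overlapping [5, 15]@2: the higher markedAt wins on [5, 10],
        // so the list becomes equivalent to [[0, 5]@1, [5, 15]@2].
        list.add(ByteBufferUtil.bytes(0), ByteBufferUtil.bytes(10), 1L, 0);
        list.add(ByteBufferUtil.bytes(5), ByteBufferUtil.bytes(15), 2L, 0);

        assert list.search(ByteBufferUtil.bytes(3)).markedForDeleteAt == 1L;  // covered by [0, 5]@1
        assert list.search(ByteBufferUtil.bytes(7)).markedForDeleteAt == 2L;  // covered by [5, 15]@2
        assert list.search(ByteBufferUtil.bytes(20)) == null;                 // not covered at all

        // A column is deleted only if its timestamp is <= the markedAt of the covering tombstone.
        assert list.isDeleted(ByteBufferUtil.bytes(7), 2L);
        assert !list.isDeleted(ByteBufferUtil.bytes(7), 3L);

        // InOrderTester gives the same answers but requires names to be queried in comparator order.
        RangeTombstoneList.InOrderTester tester = list.inOrderTester();
        assert tester.isDeleted(ByteBufferUtil.bytes(3), 1L);
        assert tester.isDeleted(ByteBufferUtil.bytes(7), 2L);
    }
}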