/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.cassandra.utils;

import java.io.Serializable;
import java.util.*;

import com.google.common.collect.AbstractIterator;
import com.google.common.collect.PeekingIterator;

import org.apache.cassandra.dht.IPartitioner;
import org.apache.cassandra.dht.Range;
import org.apache.cassandra.dht.Token;

/**
 * A MerkleTree implemented as a binary tree.
 *
 * A MerkleTree is a full binary tree that represents a perfect binary tree of
 * depth 'hashdepth'. In a perfect binary tree, each leaf contains a
 * sequentially hashed range, and each inner node contains the binary hash of
 * its two children. In the MerkleTree, many ranges will not be split to the
 * full depth of the perfect binary tree: the leaves of this tree are Leaf objects,
 * which contain the computed values of the nodes that would be below them if
 * the tree were perfect.
 *
 * The hash values of the inner nodes of the MerkleTree are calculated lazily based
 * on their children when the hash of a range is requested with hash(range).
 *
 * Inputs passed to TreeRange.addHash should be calculated using a strong hash function,
 * because all hashing internal to the tree is accomplished using XOR.
 *
 * If two MerkleTrees have the same hashdepth, they represent a perfect tree
 * of the same depth, and can always be compared, regardless of size or splits.
 */
public class MerkleTree implements Serializable
{
    private static final long serialVersionUID = 2L;

    public static final byte RECOMMENDED_DEPTH = Byte.MAX_VALUE - 1;

    public static final int CONSISTENT = 0;
    public static final int FULLY_INCONSISTENT = 1;
    public static final int PARTIALLY_INCONSISTENT = 2;

    public final byte hashdepth;

    private transient IPartitioner partitioner;

    private long maxsize;
    private long size;
    private Hashable root;

    /**
     * @param partitioner The partitioner in use.
     * @param hashdepth The maximum depth of the tree. 100/(2^depth) is the %
     *        of the key space covered by each subrange of a fully populated tree.
     * @param maxsize The maximum number of subranges in the tree.
     */
    public MerkleTree(IPartitioner partitioner, byte hashdepth, long maxsize)
    {
        assert hashdepth < Byte.MAX_VALUE;
        this.partitioner = partitioner;
        this.hashdepth = hashdepth;
        this.maxsize = maxsize;

        size = 1;
        root = new Leaf(null);
    }

    static byte inc(byte in)
    {
        assert in < Byte.MAX_VALUE;
        return (byte)(in + 1);
    }
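
    /*
     * A rough usage sketch, not part of the class itself: one plausible way to
     * build a tree per replica and compare the two. The 'partitioner' instance
     * and the chosen hashdepth/maxsize values are illustrative assumptions only.
     *
     *   MerkleTree ltree = new MerkleTree(partitioner, (byte) 15, 512);
     *   MerkleTree rtree = new MerkleTree(partitioner, (byte) 15, 512);
     *   ltree.init();
     *   rtree.init();
     *   // ... each side mixes strong per-row hashes into its invalid ranges ...
     *   List<TreeRange> diff = MerkleTree.difference(ltree, rtree);
     *   // diff now holds the largest contiguous ranges on which the trees disagree
     */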
    /**
     * Initializes this tree by splitting it until hashdepth is reached,
     * or until an additional level of splits would violate maxsize.
     *
     * NB: Replaces all nodes in the tree.
     */
    public void init()
    {
        // determine the depth to which we can safely split the tree
        byte sizedepth = (byte)(Math.log10(maxsize) / Math.log10(2));
        byte depth = (byte)Math.min(sizedepth, hashdepth);

        Token mintoken = partitioner.getMinimumToken();
        root = initHelper(mintoken, mintoken, (byte)0, depth);
        size = (long)Math.pow(2, depth);
    }

    private Hashable initHelper(Token left, Token right, byte depth, byte max)
    {
        if (depth == max)
            // we've reached the leaves
            return new Leaf();
        Token midpoint = partitioner.midpoint(left, right);
        Hashable lchild = initHelper(left, midpoint, inc(depth), max);
        Hashable rchild = initHelper(midpoint, right, inc(depth), max);
        return new Inner(midpoint, lchild, rchild);
    }

    Hashable root()
    {
        return root;
    }

    public IPartitioner partitioner()
    {
        return partitioner;
    }

    /**
     * The number of distinct ranges contained in this tree. This is a reasonable
     * measure of the memory usage of the tree.
     */
    public long size()
    {
        return size;
    }

    public long maxsize()
    {
        return maxsize;
    }

    public void maxsize(long maxsize)
    {
        this.maxsize = maxsize;
    }

    /**
     * TODO: Find another way to use the local partitioner after serialization.
     */
    public void partitioner(IPartitioner partitioner)
    {
        this.partitioner = partitioner;
    }

    /**
     * @param ltree First tree.
     * @param rtree Second tree.
     * @return A list of the largest contiguous ranges where the given trees disagree.
     */
    public static List<TreeRange> difference(MerkleTree ltree, MerkleTree rtree)
    {
        List<TreeRange> diff = new ArrayList<TreeRange>();
        Token mintoken = ltree.partitioner.getMinimumToken();
        TreeRange active = new TreeRange(null, mintoken, mintoken, (byte)0, null);

        byte[] lhash = ltree.hash(active);
        byte[] rhash = rtree.hash(active);

        if (lhash != null && rhash != null && !Arrays.equals(lhash, rhash))
        {
            if (FULLY_INCONSISTENT == differenceHelper(ltree, rtree, diff, active))
                diff.add(active);
        }
        else if (lhash == null || rhash == null)
            diff.add(active);

        return diff;
    }
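
    /*
     * Worked example of the depth calculation in init(), using illustrative values:
     * with maxsize = 512 and hashdepth = 15, sizedepth = log2(512) = 9, so
     * depth = min(9, 15) = 9 and init() builds 2^9 = 512 equal subranges. With a
     * smaller hashdepth of 7, the split would instead stop at 2^7 = 128 subranges.
     */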
    /**
     * TODO: This function could be optimized into a depth-first traversal of
     * the two trees in parallel.
     *
     * Takes two trees and a range for which both have hashes, but disagree.
     * @return FULLY_INCONSISTENT if active is inconsistent, PARTIALLY_INCONSISTENT if only a subrange is inconsistent.
     */
    static int differenceHelper(MerkleTree ltree, MerkleTree rtree, List<TreeRange> diff, TreeRange active)
    {
        Token midpoint = ltree.partitioner().midpoint(active.left, active.right);
        TreeRange left = new TreeRange(null, active.left, midpoint, inc(active.depth), null);
        TreeRange right = new TreeRange(null, midpoint, active.right, inc(active.depth), null);
        byte[] lhash;
        byte[] rhash;

        // see if we should recurse left
        lhash = ltree.hash(left);
        rhash = rtree.hash(left);
        int ldiff = CONSISTENT;
        boolean lreso = lhash != null && rhash != null;
        if (lreso && !Arrays.equals(lhash, rhash))
            ldiff = differenceHelper(ltree, rtree, diff, left);
        else if (!lreso)
            ldiff = FULLY_INCONSISTENT;

        // see if we should recurse right
        lhash = ltree.hash(right);
        rhash = rtree.hash(right);
        int rdiff = CONSISTENT;
        boolean rreso = lhash != null && rhash != null;
        if (rreso && !Arrays.equals(lhash, rhash))
            rdiff = differenceHelper(ltree, rtree, diff, right);
        else if (!rreso)
            rdiff = FULLY_INCONSISTENT;

        if (ldiff == FULLY_INCONSISTENT && rdiff == FULLY_INCONSISTENT)
        {
            // both children are fully inconsistent
            return FULLY_INCONSISTENT;
        }
        else if (ldiff == FULLY_INCONSISTENT)
        {
            diff.add(left);
            return PARTIALLY_INCONSISTENT;
        }
        else if (rdiff == FULLY_INCONSISTENT)
        {
            diff.add(right);
            return PARTIALLY_INCONSISTENT;
        }
        return PARTIALLY_INCONSISTENT;
    }

    /**
     * For testing purposes.
     * Gets the smallest range containing the token.
     */
    TreeRange get(Token t)
    {
        Token mintoken = partitioner.getMinimumToken();
        return getHelper(root, mintoken, mintoken, (byte)0, t);
    }

    TreeRange getHelper(Hashable hashable, Token pleft, Token pright, byte depth, Token t)
    {
        if (hashable instanceof Leaf)
        {
            // we've reached a hash: wrap it up and deliver it
            return new TreeRange(this, pleft, pright, depth, hashable);
        }
        // else: node.

        Inner node = (Inner)hashable;
        if (Range.contains(pleft, node.token, t))
            // left child contains token
            return getHelper(node.lchild, pleft, node.token, inc(depth), t);
        // else: right child contains token
        return getHelper(node.rchild, node.token, pright, inc(depth), t);
    }

    /**
     * Invalidates the ranges containing the given token.
     */
    public void invalidate(Token t)
    {
        invalidateHelper(root, partitioner.getMinimumToken(), t);
    }

    private void invalidateHelper(Hashable hashable, Token pleft, Token t)
    {
        hashable.hash(null);
        if (hashable instanceof Leaf)
            return;
        // else: node.

        Inner node = (Inner)hashable;
        if (Range.contains(pleft, node.token, t))
            // left child contains token
            invalidateHelper(node.lchild, pleft, t);
        else
            // right child contains token
            invalidateHelper(node.rchild, node.token, t);
    }

    /**
     * Hash the given range in the tree. The range must have been generated
     * with recursive applications of partitioner.midpoint().
     *
     * NB: Currently does not support wrapping ranges that do not end with
     * partitioner.getMinimumToken().
     *
     * @return Null if any subrange of the range is invalid, or if the exact
     *         range cannot be calculated using this tree.
     */
    public byte[] hash(Range range)
    {
        Token mintoken = partitioner.getMinimumToken();
        try
        {
            return hashHelper(root, new Range(mintoken, mintoken), range);
        }
        catch (StopRecursion e)
        {
            return null;
        }
    }

    /**
     * @throws StopRecursion If no match could be found for the range.
     */
    private byte[] hashHelper(Hashable hashable, Range active, Range range) throws StopRecursion
    {
        if (hashable instanceof Leaf)
        {
            if (!range.contains(active))
                // we are not fully contained in this range!
                throw new StopRecursion.BadRange();
            return hashable.hash();
        }
        // else: node.

        Inner node = (Inner)hashable;
        Range leftactive = new Range(active.left, node.token);
        Range rightactive = new Range(node.token, active.right);

        if (range.contains(active))
        {
            // this node is fully contained in the range
            if (node.hash() != null)
                // we had a cached value
                return node.hash();
            // continue recursing to hash our children
            byte[] lhash = hashHelper(node.lchild(), leftactive, range);
            byte[] rhash = hashHelper(node.rchild(), rightactive, range);
            // cache the computed value (even if it is null)
            node.hash(lhash, rhash);
            return node.hash();
        }
        // else: one of our children contains the range

        if (leftactive.contains(range))
            // left child contains/matches the range
            return hashHelper(node.lchild, leftactive, range);
        else if (rightactive.contains(range))
            // right child contains/matches the range
            return hashHelper(node.rchild, rightactive, range);
        else
            throw new StopRecursion.BadRange();
    }
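
    /*
     * A small sketch of how hash() and invalidate() interact; the tree, token and
     * range values are hypothetical. As documented on hash(), the result is null
     * whenever any subrange of the requested range is invalid.
     *
     *   Token min = tree.partitioner().getMinimumToken();
     *   Range all = new Range(min, min);
     *   byte[] before = tree.hash(all); // non-null once every subrange has a hash
     *   tree.invalidate(someToken);     // clears the hashes on the path to that token
     *   byte[] after = tree.hash(all);  // null until the invalidated ranges are re-hashed
     */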
    /**
     * Splits the range containing the given token, if no tree limits would be
     * violated. If the range would be split to a depth deeper than hashdepth, or if
     * the tree already contains maxsize subranges, this operation will fail.
     *
     * @return True if the range was successfully split.
     */
    public boolean split(Token t)
    {
        if (!(size < maxsize))
            return false;

        Token mintoken = partitioner.getMinimumToken();
        try
        {
            root = splitHelper(root, mintoken, mintoken, (byte)0, t);
        }
        catch (StopRecursion.TooDeep e)
        {
            return false;
        }
        return true;
    }

    private Hashable splitHelper(Hashable hashable, Token pleft, Token pright, byte depth, Token t) throws StopRecursion.TooDeep
    {
        if (depth >= hashdepth)
            throw new StopRecursion.TooDeep();

        if (hashable instanceof Leaf)
        {
            // split
            size++;
            Token midpoint = partitioner.midpoint(pleft, pright);
            return new Inner(midpoint, new Leaf(), new Leaf());
        }
        // else: node.

        // recurse on the matching child
        Inner node = (Inner)hashable;
        if (Range.contains(pleft, node.token, t))
            // left child contains token
            node.lchild(splitHelper(node.lchild, pleft, node.token, inc(depth), t));
        else
            // else: right child contains token
            node.rchild(splitHelper(node.rchild, node.token, pright, inc(depth), t));
        return node;
    }

    /**
     * Compacts the smallest subranges evenly split by the given token into a
     * single range.
     *
     * Asserts that the given Token falls between two compactable subranges.
     */
    public void compact(Token t)
    {
        root = compactHelper(root, t);
    }

    private Hashable compactHelper(Hashable hashable, Token t)
    {
        // we reached a Leaf without finding an Inner to compact
        assert !(hashable instanceof Leaf);

        Inner node = (Inner)hashable;
        int comp = t.compareTo(node.token);
        if (comp == 0)
        {
            // this is the node to compact
            assert node.lchild() instanceof Leaf && node.rchild() instanceof Leaf :
                "Can only compact a subrange evenly split by the given token!";

            // hash our children together into a new value to replace ourself
            size--;
            return new Leaf(node.lchild().hash(), node.rchild().hash());
        }
        else if (comp < 0)
            // recurse to the left
            node.lchild(compactHelper(node.lchild(), t));
        else
            // recurse to the right
            node.rchild(compactHelper(node.rchild(), t));
        return node;
    }
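
    /*
     * Sketch of split() and compact() as near-inverses on a single range; the
     * tree and token values are hypothetical. split() replaces a leaf with two
     * empty leaves around the midpoint of its range, and compact() at that same
     * midpoint merges them back into one leaf.
     *
     *   long before = tree.size();
     *   if (tree.split(someToken))              // splits the leaf range containing someToken
     *   {
     *       // tree.size() == before + 1
     *       tree.compact(midpointOfThatRange);  // hypothetical: the midpoint the split created
     *       // tree.size() == before
     *   }
     */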
    /**
     * Returns a lazy iterator of invalid TreeRanges that need to be filled
     * in order to make the given Range valid.
     *
     * @param range The range to find invalid subranges for.
     */
    public TreeRangeIterator invalids(Range range)
    {
        return new TreeRangeIterator(this, range);
    }

    @Override
    public String toString()
    {
        StringBuilder buff = new StringBuilder();
        buff.append("#<MerkleTree root=");
        root.toString(buff, 8);
        buff.append(">");
        return buff.toString();
    }

    /**
     * The public interface to a range in the tree.
     *
     * NB: A TreeRange should not be returned by a public method unless the
     * parents of the range it represents are already invalidated, since it
     * will allow someone to modify the hash. Alternatively, a TreeRange
     * may be created with a null tree, indicating that it is read only.
     */
    public static class TreeRange extends Range
    {
        public static final long serialVersionUID = 1L;
        private final MerkleTree tree;
        public final byte depth;
        private final Hashable hashable;

        TreeRange(MerkleTree tree, Token left, Token right, byte depth, Hashable hashable)
        {
            super(left, right);
            this.tree = tree;
            this.depth = depth;
            this.hashable = hashable;
        }

        public void hash(byte[] hash)
        {
            assert tree != null : "Not intended for modification!";
            hashable.hash(hash);
        }

        public byte[] hash()
        {
            return hashable.hash();
        }

        /**
         * @param entry Row to mix into the hash for this range.
         */
        public void addHash(RowHash entry)
        {
            assert tree != null : "Not intended for modification!";
            assert hashable instanceof Leaf;

            hashable.addHash(entry.hash);
        }

        public void addAll(Iterator<RowHash> entries)
        {
            while (entries.hasNext())
                addHash(entries.next());
        }

        @Override
        public String toString()
        {
            StringBuilder buff = new StringBuilder("#<TreeRange ");
            buff.append(super.toString()).append(" depth=").append(depth);
            return buff.append(">").toString();
        }
    }

    /**
     * Performs a depth-first, inorder traversal of invalid nodes under the given root
     * and intersecting the given range.
     */
    public static class TreeRangeIterator extends AbstractIterator<TreeRange> implements Iterable<TreeRange>, PeekingIterator<TreeRange>
    {
        // stack of ranges to visit
        private final ArrayDeque<TreeRange> tovisit;
        // interesting range
        private final Range range;
        private final MerkleTree tree;

        TreeRangeIterator(MerkleTree tree, Range range)
        {
            Token mintoken = tree.partitioner().getMinimumToken();
            tovisit = new ArrayDeque<TreeRange>();
            tovisit.add(new TreeRange(tree, mintoken, mintoken, (byte)0, tree.root));

            this.tree = tree;
            this.range = range;
        }

        /**
         * Find the next invalid TreeRange.
         *
         * @return The next invalid TreeRange, or endOfData() if none remain.
         */
        @Override
        public TreeRange computeNext()
        {
            while (!tovisit.isEmpty())
            {
                TreeRange active = tovisit.pop();

                if (active.hashable.hash() != null)
                    // skip valid ranges
                    continue;
                if (active.hashable instanceof Leaf)
                    // found a leaf invalid range
                    return active;

                Inner node = (Inner)active.hashable;
                // push intersecting children onto the stack
                TreeRange left = new TreeRange(tree, active.left, node.token, inc(active.depth), node.lchild);
                TreeRange right = new TreeRange(tree, node.token, active.right, inc(active.depth), node.rchild);
                if (right.intersects(range))
                    tovisit.push(right);
                if (left.intersects(range))
                    tovisit.push(left);
            }
            return endOfData();
        }

        public Iterator<TreeRange> iterator()
        {
            return this;
        }
    }
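
    /*
     * Sketch of how a replica might fill its leaf hashes using invalids(): the
     * iterator yields only the invalid leaf ranges intersecting the requested
     * range, left to right. The tree instance and the rowHashesFor() helper
     * (producing an Iterator<RowHash> of strong per-row digests) are hypothetical.
     *
     *   Token min = tree.partitioner().getMinimumToken();
     *   for (TreeRange invalid : tree.invalids(new Range(min, min)))
     *       invalid.addAll(rowHashesFor(invalid));
     */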
    /**
     * An inner node in the MerkleTree. Inners can contain cached hash values, which
     * are the binary hash of their two children.
     */
    static class Inner extends Hashable
    {
        public static final long serialVersionUID = 1L;
        public final Token token;
        private Hashable lchild;
        private Hashable rchild;

        /**
         * Constructs an Inner with the given token and children, and a null hash.
         */
        public Inner(Token token, Hashable lchild, Hashable rchild)
        {
            super(null);
            this.token = token;
            this.lchild = lchild;
            this.rchild = rchild;
        }

        public Hashable lchild()
        {
            return lchild;
        }

        public Hashable rchild()
        {
            return rchild;
        }

        public void lchild(Hashable child)
        {
            lchild = child;
        }

        public void rchild(Hashable child)
        {
            rchild = child;
        }

        /**
         * Recursive toString.
         */
        @Override
        public void toString(StringBuilder buff, int maxdepth)
        {
            buff.append("#<").append(getClass().getSimpleName());
            buff.append(" ").append(token);
            buff.append(" hash=").append(Hashable.toString(hash()));
            buff.append(" children=[");
            if (maxdepth < 1)
            {
                buff.append("#");
            }
            else
            {
                if (lchild == null)
                    buff.append("null");
                else
                    lchild.toString(buff, maxdepth-1);
                buff.append(" ");
                if (rchild == null)
                    buff.append("null");
                else
                    rchild.toString(buff, maxdepth-1);
            }
            buff.append("]>");
        }

        @Override
        public String toString()
        {
            StringBuilder buff = new StringBuilder();
            toString(buff, 1);
            return buff.toString();
        }
    }

    /**
     * A leaf node in the MerkleTree. Because the MerkleTree represents a much
     * larger perfect binary tree of depth hashdepth, a Leaf object contains
     * the value that would be contained in the perfect tree at its position.
     *
     * When rows are added to the MerkleTree using TreeRange.addHash(), the
     * tree extending below the Leaf is generated in memory, but only the root
     * is stored in the Leaf.
     */
    static class Leaf extends Hashable
    {
        public static final long serialVersionUID = 1L;

        /**
         * Constructs a null hash.
         */
        public Leaf()
        {
            super(null);
        }

        public Leaf(byte[] hash)
        {
            super(hash);
        }

        public Leaf(byte[] lefthash, byte[] righthash)
        {
            super(Hashable.binaryHash(lefthash, righthash));
        }

        @Override
        public void toString(StringBuilder buff, int maxdepth)
        {
            buff.append(toString());
        }

        @Override
        public String toString()
        {
            return "#<Leaf " + Hashable.toString(hash()) + ">";
        }
    }

    /**
     * Hash value representing a row, to be used to pass hashes to the MerkleTree.
     * The byte[] hash value should contain a digest of the key and value of the row
     * created using a strong hash function.
     */
    public static class RowHash
    {
        public final Token token;
        public final byte[] hash;

        public RowHash(Token token, byte[] hash)
        {
            this.token = token;
            this.hash = hash;
        }

        @Override
        public String toString()
        {
            return "#<RowHash " + token + " " + Hashable.toString(hash) + ">";
        }
    }

    /**
     * Abstract class containing hashing logic, and containing a single hash field.
     */
    static abstract class Hashable implements Serializable
    {
        private static final long serialVersionUID = 1L;

        protected byte[] hash;

        protected Hashable(byte[] hash)
        {
            this.hash = hash;
        }

        public byte[] hash()
        {
            return hash;
        }

        void hash(byte[] hash)
        {
            this.hash = hash;
        }

        /**
         * Sets the value of this hash to binaryHash of its children.
         * @param lefthash Hash of left child.
         * @param righthash Hash of right child.
         */
        void hash(byte[] lefthash, byte[] righthash)
        {
            hash = binaryHash(lefthash, righthash);
        }

        /**
         * Mixes the given value into our hash. If our hash is null,
         * our hash will become the given value.
         */
        void addHash(byte[] righthash)
        {
            if (hash == null)
                hash = righthash;
            else
                hash = binaryHash(hash, righthash);
        }
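
        /*
         * Note on binaryHash() below: because the mixing primitive is a plain XOR,
         * it is commutative and self-cancelling. For example, with hypothetical
         * equal-length digests a and b:
         *
         *   Arrays.equals(binaryHash(a, b), binaryHash(b, a));  // true: order does not matter
         *   binaryHash(a, a);                                   // all zero bytes: a cancels itself
         *
         * This is why the per-row hashes fed to addHash() should come from a strong
         * hash function, as the class comment notes.
         */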
        /**
         * The primitive with which all hashing should be accomplished: hashes
         * a left and right value together.
         */
        static byte[] binaryHash(final byte[] left, final byte[] right)
        {
            return FBUtilities.xor(left, right);
        }

        public abstract void toString(StringBuilder buff, int maxdepth);

        public static String toString(byte[] hash)
        {
            if (hash == null)
                return "null";
            return "[" + FBUtilities.bytesToHex(hash) + "]";
        }
    }

    /**
     * Exceptions that stop recursion early when we are sure that no answer
     * can be found.
     */
    static abstract class StopRecursion extends Exception
    {
        static class BadRange extends StopRecursion
        {
            public BadRange() { super(); }
        }

        static class InvalidHash extends StopRecursion
        {
            public InvalidHash() { super(); }
        }

        static class TooDeep extends StopRecursion
        {
            public TooDeep() { super(); }
        }
    }
}