/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.cassandra.dht.tokenallocator; import java.util.*; import com.google.common.collect.HashMultimap; import com.google.common.collect.ImmutableList; import com.google.common.collect.Maps; import com.google.common.collect.Multimap; import org.apache.cassandra.dht.IPartitioner; import org.apache.cassandra.dht.Token; /** * A Replication Aware allocator for tokens, that attempts to ensure an even distribution of ownership across * the known cluster for the provided replication strategy. * * A unit is shorthand for a "unit of ownership" which translates roughly to a node, or a disk on the node, * a CPU on the node, or some other relevant unit of ownership. These units should be the lowest rung over which * ownership needs to be evenly distributed. At the moment only nodes as a whole are treated as units, but that * will change with the introduction of token ranges per disk. */ class ReplicationAwareTokenAllocator<Unit> extends TokenAllocatorBase<Unit> { final Multimap<Unit, Token> unitToTokens; final int replicas; ReplicationAwareTokenAllocator(NavigableMap<Token, Unit> sortedTokens, ReplicationStrategy<Unit> strategy, IPartitioner partitioner) { super(sortedTokens, strategy, partitioner); unitToTokens = HashMultimap.create(); for (Map.Entry<Token, Unit> en : sortedTokens.entrySet()) unitToTokens.put(en.getValue(), en.getKey()); this.replicas = strategy.replicas(); } public int getReplicas() { return replicas; } public Collection<Token> addUnit(Unit newUnit, int numTokens) { assert !unitToTokens.containsKey(newUnit); if (unitCount() < replicas) // Allocation does not matter; everything replicates everywhere. return generateRandomTokens(newUnit, numTokens); if (numTokens > sortedTokens.size()) // Some of the heuristics below can't deal with this case. Use random for now, later allocations can fix any problems this may cause. return generateRandomTokens(newUnit, numTokens); // ============= construct our initial token ring state ============= double optTokenOwnership = optimalTokenOwnership(numTokens); Map<Object, GroupInfo> groups = Maps.newHashMap(); Map<Unit, UnitInfo<Unit>> unitInfos = createUnitInfos(groups); if (groups.size() < replicas) { // We need at least replicas groups to do allocation correctly. If there aren't enough, // use random allocation. // This part of the code should only be reached via the RATATest. StrategyAdapter should disallow // token allocation in this case as the algorithm is not able to cover the behavior of NetworkTopologyStrategy. return generateRandomTokens(newUnit, numTokens); } // initialise our new unit's state (with an idealised ownership) // strategy must already know about this unit UnitInfo<Unit> newUnitInfo = new UnitInfo<>(newUnit, numTokens * optTokenOwnership, groups, strategy); // build the current token ring state TokenInfo<Unit> tokens = createTokenInfos(unitInfos, newUnitInfo.group); newUnitInfo.tokenCount = numTokens; // ============= construct and rank our candidate token allocations ============= // walk the token ring, constructing the set of candidates in ring order // as the midpoints between all existing tokens CandidateInfo<Unit> candidates = createCandidates(tokens, newUnitInfo, optTokenOwnership); // Evaluate the expected improvements from all candidates and form a priority queue. PriorityQueue<Weighted<CandidateInfo<Unit>>> improvements = new PriorityQueue<>(sortedTokens.size()); CandidateInfo<Unit> candidate = candidates; do { double impr = evaluateImprovement(candidate, optTokenOwnership, 1.0 / numTokens); improvements.add(new Weighted<>(impr, candidate)); candidate = candidate.next; } while (candidate != candidates); // ============= iteratively take the best candidate, and re-rank ============= CandidateInfo<Unit> bestToken = improvements.remove().value; for (int vn = 1; ; ++vn) { candidates = bestToken.removeFrom(candidates); confirmCandidate(bestToken); if (vn == numTokens) break; while (true) { // Get the next candidate in the queue. Its improvement may have changed (esp. if multiple tokens // were good suggestions because they could improve the same problem)-- evaluate it again to check // if it is still a good candidate. bestToken = improvements.remove().value; double impr = evaluateImprovement(bestToken, optTokenOwnership, (vn + 1.0) / numTokens); Weighted<CandidateInfo<Unit>> next = improvements.peek(); // If it is better than the next in the queue, it is good enough. This is a heuristic that doesn't // get the best results, but works well enough and on average cuts search time by a factor of O(vnodes). if (next == null || impr >= next.weight) break; improvements.add(new Weighted<>(impr, bestToken)); } } return ImmutableList.copyOf(unitToTokens.get(newUnit)); } private Collection<Token> generateRandomTokens(Unit newUnit, int numTokens) { Set<Token> tokens = new HashSet<>(numTokens); while (tokens.size() < numTokens) { Token token = partitioner.getRandomToken(); if (!sortedTokens.containsKey(token)) { tokens.add(token); sortedTokens.put(token, newUnit); unitToTokens.put(newUnit, token); } } return tokens; } /** * Construct the token ring as a CircularList of TokenInfo, * and populate the ownership of the UnitInfo's provided */ private TokenInfo<Unit> createTokenInfos(Map<Unit, UnitInfo<Unit>> units, GroupInfo newUnitGroup) { // build the circular list TokenInfo<Unit> prev = null; TokenInfo<Unit> first = null; for (Map.Entry<Token, Unit> en : sortedTokens.entrySet()) { Token t = en.getKey(); UnitInfo<Unit> ni = units.get(en.getValue()); TokenInfo<Unit> ti = new TokenInfo<>(t, ni); first = ti.insertAfter(first, prev); prev = ti; } TokenInfo<Unit> curr = first; do { populateTokenInfoAndAdjustUnit(curr, newUnitGroup); curr = curr.next; } while (curr != first); return first; } private CandidateInfo<Unit> createCandidates(TokenInfo<Unit> tokens, UnitInfo<Unit> newUnitInfo, double initialTokenOwnership) { TokenInfo<Unit> curr = tokens; CandidateInfo<Unit> first = null; CandidateInfo<Unit> prev = null; do { CandidateInfo<Unit> candidate = new CandidateInfo<Unit>(partitioner.midpoint(curr.prev.token, curr.token), curr, newUnitInfo); first = candidate.insertAfter(first, prev); candidate.replicatedOwnership = initialTokenOwnership; populateCandidate(candidate); prev = candidate; curr = curr.next; } while (curr != tokens); prev.next = first; return first; } private void populateCandidate(CandidateInfo<Unit> candidate) { // Only finding replication start would do. populateTokenInfo(candidate, candidate.owningUnit.group); } /** * Incorporates the selected candidate into the ring, adjusting ownership information and calculated token * information. */ private void confirmCandidate(CandidateInfo<Unit> candidate) { // This process is less efficient than it could be (loops through each vnode's replication span instead // of recalculating replicationStart, replicationThreshold from existing data + new token data in an O(1) // case analysis similar to evaluateImprovement). This is fine as the method does not dominate processing // time. // Put the accepted candidate in the token list. UnitInfo<Unit> newUnit = candidate.owningUnit; Token newToken = candidate.token; sortedTokens.put(newToken, newUnit.unit); unitToTokens.put(newUnit.unit, newToken); TokenInfo<Unit> prev = candidate.prevInRing(); TokenInfo<Unit> newTokenInfo = new TokenInfo<>(newToken, newUnit); newTokenInfo.replicatedOwnership = candidate.replicatedOwnership; newTokenInfo.insertAfter(prev, prev); // List is not empty so this won't need to change head of list. // Update data for candidate. populateTokenInfoAndAdjustUnit(newTokenInfo, newUnit.group); ReplicationVisitor replicationVisitor = new ReplicationVisitor(); assert newTokenInfo.next == candidate.split; for (TokenInfo<Unit> curr = newTokenInfo.next; !replicationVisitor.visitedAll(); curr = curr.next) { // update the candidate between curr and next candidate = candidate.next; populateCandidate(candidate); if (!replicationVisitor.add(curr.owningUnit.group)) continue; // If we've already seen this group, the token cannot be affected. populateTokenInfoAndAdjustUnit(curr, newUnit.group); } replicationVisitor.clean(); } /** * Calculates the {@code replicationStart} of a token, as well as {@code replicationThreshold} which is chosen in a way * that permits {@code findUpdatedReplicationStart} to quickly identify changes in ownership. */ private Token populateTokenInfo(BaseTokenInfo<Unit, ?> token, GroupInfo newUnitGroup) { GroupInfo tokenGroup = token.owningUnit.group; PopulateVisitor visitor = new PopulateVisitor(); // Replication start = the end of a token from the RF'th different group seen before the token. Token replicationStart; // The end of a token from the RF-1'th different group seen before the token. Token replicationThreshold = token.token; GroupInfo currGroup; for (TokenInfo<Unit> curr = token.prevInRing(); ; curr = curr.prev) { replicationStart = curr.token; currGroup = curr.owningUnit.group; if (!visitor.add(currGroup)) continue; // Group is already seen. if (visitor.visitedAll()) break; replicationThreshold = replicationStart; // Another instance of the same group precedes us in the replication range of the ring, // so this is where our replication range begins if (currGroup == tokenGroup) break; } if (newUnitGroup == tokenGroup) // new token is always a boundary (as long as it's closer than replicationStart) replicationThreshold = token.token; else if (newUnitGroup != currGroup && visitor.seen(newUnitGroup)) // already has new group in replication span before last seen. cannot be affected replicationThreshold = replicationStart; visitor.clean(); token.replicationThreshold = replicationThreshold; token.replicationStart = replicationStart; return replicationStart; } private void populateTokenInfoAndAdjustUnit(TokenInfo<Unit> populate, GroupInfo newUnitGroup) { Token replicationStart = populateTokenInfo(populate, newUnitGroup); double newOwnership = replicationStart.size(populate.token); double oldOwnership = populate.replicatedOwnership; populate.replicatedOwnership = newOwnership; populate.owningUnit.ownership += newOwnership - oldOwnership; } /** * Evaluates the improvement in variance for both units and individual tokens when candidate is inserted into the * ring. */ private double evaluateImprovement(CandidateInfo<Unit> candidate, double optTokenOwnership, double newUnitMult) { double tokenChange = 0; UnitInfo<Unit> candidateUnit = candidate.owningUnit; Token candidateEnd = candidate.token; // Form a chain of units affected by the insertion to be able to qualify change of unit ownership. // A unit may be affected more than once. UnitAdjustmentTracker<Unit> unitTracker = new UnitAdjustmentTracker<>(candidateUnit); // Reflect change in ownership of the splitting token (candidate). tokenChange += applyOwnershipAdjustment(candidate, candidateUnit, candidate.replicationStart, candidateEnd, optTokenOwnership, unitTracker); // Loop through all vnodes that replicate candidate or split and update their ownership. ReplicationVisitor replicationVisitor = new ReplicationVisitor(); for (TokenInfo<Unit> curr = candidate.split; !replicationVisitor.visitedAll(); curr = curr.next) { UnitInfo<Unit> currUnit = curr.owningUnit; if (!replicationVisitor.add(currUnit.group)) continue; // If this group is already seen, the token cannot be affected. Token replicationEnd = curr.token; Token replicationStart = findUpdatedReplicationStart(curr, candidate); tokenChange += applyOwnershipAdjustment(curr, currUnit, replicationStart, replicationEnd, optTokenOwnership, unitTracker); } replicationVisitor.clean(); double nodeChange = unitTracker.calculateUnitChange(newUnitMult, optTokenOwnership); return -(tokenChange + nodeChange); } /** * Returns the start of the replication span for the token {@code curr} when {@code candidate} is inserted into the * ring. */ private Token findUpdatedReplicationStart(TokenInfo<Unit> curr, CandidateInfo<Unit> candidate) { return furtherStartToken(curr.replicationThreshold, candidate.token, curr.token); } /** * Applies the ownership adjustment for the given element, updating tracked unit ownership and returning the change * of variance. */ private double applyOwnershipAdjustment(BaseTokenInfo<Unit, ?> curr, UnitInfo<Unit> currUnit, Token replicationStart, Token replicationEnd, double optTokenOwnership, UnitAdjustmentTracker<Unit> unitTracker) { double oldOwnership = curr.replicatedOwnership; double newOwnership = replicationStart.size(replicationEnd); double tokenCount = currUnit.tokenCount; assert tokenCount > 0; unitTracker.add(currUnit, newOwnership - oldOwnership); return (sq(newOwnership - optTokenOwnership) - sq(oldOwnership - optTokenOwnership)) / sq(tokenCount); } /** * Tracker for unit ownership changes. The changes are tracked by a chain of UnitInfos where the adjustedOwnership * field is being updated as we see changes in token ownership. * * The chain ends with an element that points to itself; this element must be specified as argument to the * constructor as well as be the first unit with which 'add' is called; when calculating the variance change * a separate multiplier is applied to it (used to permit more freedom in choosing the first tokens of a unit). */ private static class UnitAdjustmentTracker<Unit> { UnitInfo<Unit> unitsChain; UnitAdjustmentTracker(UnitInfo<Unit> newUnit) { unitsChain = newUnit; } void add(UnitInfo<Unit> currUnit, double diff) { if (currUnit.prevUsed == null) { assert unitsChain.prevUsed != null || currUnit == unitsChain; currUnit.adjustedOwnership = currUnit.ownership + diff; currUnit.prevUsed = unitsChain; unitsChain = currUnit; } else { currUnit.adjustedOwnership += diff; } } double calculateUnitChange(double newUnitMult, double optTokenOwnership) { double unitChange = 0; UnitInfo<Unit> unitsChain = this.unitsChain; // Now loop through the units chain and add the unit-level changes. Also clear the groups' seen marks. while (true) { double newOwnership = unitsChain.adjustedOwnership; double oldOwnership = unitsChain.ownership; double tokenCount = unitsChain.tokenCount; double diff = (sq(newOwnership / tokenCount - optTokenOwnership) - sq(oldOwnership / tokenCount - optTokenOwnership)); UnitInfo<Unit> prev = unitsChain.prevUsed; unitsChain.prevUsed = null; if (unitsChain != prev) unitChange += diff; else { unitChange += diff * newUnitMult; break; } unitsChain = prev; } this.unitsChain = unitsChain; return unitChange; } } /** * Helper class for marking/unmarking visited a chain of groups */ private abstract class GroupVisitor { GroupInfo groupChain = GroupInfo.TERMINATOR; int seen = 0; abstract GroupInfo prevSeen(GroupInfo group); abstract void setPrevSeen(GroupInfo group, GroupInfo prevSeen); // true iff this is the first time we've visited this group boolean add(GroupInfo group) { if (prevSeen(group) != null) return false; ++seen; setPrevSeen(group, groupChain); groupChain = group; return true; } boolean visitedAll() { return seen >= replicas; } boolean seen(GroupInfo group) { return prevSeen(group) != null; } // Clean group seen markers. void clean() { GroupInfo groupChain = this.groupChain; while (groupChain != GroupInfo.TERMINATOR) { GroupInfo prev = prevSeen(groupChain); setPrevSeen(groupChain, null); groupChain = prev; } this.groupChain = GroupInfo.TERMINATOR; } } private class ReplicationVisitor extends GroupVisitor { GroupInfo prevSeen(GroupInfo group) { return group.prevSeen; } void setPrevSeen(GroupInfo group, GroupInfo prevSeen) { group.prevSeen = prevSeen; } } private class PopulateVisitor extends GroupVisitor { GroupInfo prevSeen(GroupInfo group) { return group.prevPopulate; } void setPrevSeen(GroupInfo group, GroupInfo prevSeen) { group.prevPopulate = prevSeen; } } private double optimalTokenOwnership(int tokensToAdd) { return 1.0 * replicas / (sortedTokens.size() + tokensToAdd); } /** * Selects from {@code t1}, {@code t2} the token that forms a bigger range with {@code towards} as the upper bound, * taking into account wrapping. * Unlike Token.size(), equality is taken to mean "same as" rather than covering the whole range. */ private static Token furtherStartToken(Token t1, Token t2, Token towards) { if (t1.equals(towards)) return t2; if (t2.equals(towards)) return t1; return t1.size(towards) > t2.size(towards) ? t1 : t2; } private static double sq(double d) { return d * d; } /** * For testing, remove the given unit preserving correct state of the allocator. */ void removeUnit(Unit n) { Collection<Token> tokens = unitToTokens.removeAll(n); sortedTokens.keySet().removeAll(tokens); } public int unitCount() { return unitToTokens.asMap().size(); } public String toString() { return getClass().getSimpleName(); } /** * TokenInfo about candidate new tokens/vnodes. */ private static class CandidateInfo<Unit> extends BaseTokenInfo<Unit, CandidateInfo<Unit>> { // directly preceding token in the current token ring final TokenInfo<Unit> split; public CandidateInfo(Token token, TokenInfo<Unit> split, UnitInfo<Unit> owningUnit) { super(token, owningUnit); this.split = split; } TokenInfo<Unit> prevInRing() { return split.prev; } } static void dumpTokens(String lead, BaseTokenInfo<?, ?> tokens) { BaseTokenInfo<?, ?> token = tokens; do { System.out.format("%s%s: rs %s rt %s size %.2e%n", lead, token, token.replicationStart, token.replicationThreshold, token.replicatedOwnership); token = token.next; } while (token != null && token != tokens); } }