/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.cassandra.dht.tokenallocator;

import java.util.*;

import junit.framework.Assert;

import com.google.common.collect.Iterables;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
import org.junit.Test;

import org.apache.cassandra.Util;
import org.apache.cassandra.dht.Murmur3Partitioner;
import org.apache.cassandra.dht.Token;

/**
 * Exercises {@code ReplicationAwareTokenAllocator} against several mock replication strategies:
 * grows clusters from scratch and from an existing randomly-generated state, removes and replaces
 * units, and verifies that per-unit and per-token ownership stay within the tolerances each
 * strategy declares via {@code spreadExpectation()}.
 */
public class ReplicationAwareTokenAllocatorTest
{
    // Maximum vnodes per unit exercised by the tests (perUnitCount sweeps 1, 4, 16, 64).
    private static final int MAX_VNODE_COUNT = 64;
    // Cluster size the grow/lose-and-replace scenarios converge to.
    private static final int TARGET_CLUSTER_SIZE = 250;

    /**
     * Test-side extension of the allocator's {@code ReplicationStrategy} contract: adds unit
     * membership management plus the two queries the ownership evaluation below relies on.
     */
    interface TestReplicationStrategy extends ReplicationStrategy<Unit>
    {
        void addUnit(Unit n);

        void removeUnit(Unit n);

        /**
         * Returns a list of all replica units for given token.
         */
        List<Unit> getReplicas(Token token, NavigableMap<Token, Unit> sortedTokens);

        /**
         * Returns the start of the token span that is replicated in this token.
         * Note: Though this is not trivial to see, the replicated span is always contiguous. A token in the same
         * group acts as a barrier; if one is not found the token replicates everything up to the replica'th distinct
         * group seen in front of it.
         */
        Token replicationStart(Token token, Unit unit, NavigableMap<Token, Unit> sortedTokens);

        /**
         * Multiplier for the acceptable disbalance in the cluster. With some strategies it is harder to achieve good
         * results.
         */
        public double spreadExpectation();
    }

    /**
     * RF=1 strategy: each token is owned solely by the unit holding the next token on the ring.
     */
    static class NoReplicationStrategy implements TestReplicationStrategy
    {
        public List<Unit> getReplicas(Token token, NavigableMap<Token, Unit> sortedTokens)
        {
            return Collections.singletonList(sortedTokens.ceilingEntry(token).getValue());
        }

        public Token replicationStart(Token token, Unit unit, NavigableMap<Token, Unit> sortedTokens)
        {
            // With no replication a token owns exactly the span from its predecessor.
            return sortedTokens.lowerKey(token);
        }

        public String toString()
        {
            return "No replication";
        }

        public void addUnit(Unit n)
        {
        }

        public void removeUnit(Unit n)
        {
        }

        public int replicas()
        {
            return 1;
        }

        public boolean sameGroup(Unit n1, Unit n2)
        {
            return false;
        }

        public Object getGroup(Unit unit)
        {
            return unit;
        }

        public double spreadExpectation()
        {
            return 1;
        }
    }

    /**
     * SimpleStrategy analogue: replicas are the next {@code replicas} distinct units found
     * walking the ring clockwise from the token; no grouping (every unit is its own group).
     */
    static class SimpleReplicationStrategy implements TestReplicationStrategy
    {
        int replicas;

        public SimpleReplicationStrategy(int replicas)
        {
            super();
            this.replicas = replicas;
        }

        public List<Unit> getReplicas(Token token, NavigableMap<Token, Unit> sortedTokens)
        {
            List<Unit> endpoints = new ArrayList<Unit>(replicas);

            token = sortedTokens.ceilingKey(token);
            if (token == null)
                token = sortedTokens.firstKey();
            // Concatenating the tail map with the full map wraps the walk around the ring.
            Iterator<Unit> iter = Iterables.concat(sortedTokens.tailMap(token, true).values(), sortedTokens.values()).iterator();
            while (endpoints.size() < replicas)
            {
                // Fewer distinct units than replicas: return what we have.
                if (!iter.hasNext())
                    return endpoints;
                Unit ep = iter.next();
                if (!endpoints.contains(ep))
                    endpoints.add(ep);
            }
            return endpoints;
        }

        public Token replicationStart(Token token, Unit unit, NavigableMap<Token, Unit> sortedTokens)
        {
            Set<Unit> seenUnits = Sets.newHashSet();
            int unitsFound = 0;

            // Walk the ring backwards (wrapping) from the token to find where its replicated span begins.
            for (Map.Entry<Token, Unit> en : Iterables.concat(
                 sortedTokens.headMap(token, false).descendingMap().entrySet(),
                 sortedTokens.descendingMap().entrySet()))
            {
                Unit n = en.getValue();
                // Same group as investigated unit is a break; anything that could replicate in it replicates there.
                if (n == unit)
                    break;

                if (seenUnits.add(n))
                {
                    if (++unitsFound == replicas)
                        break;
                }
                token = en.getKey();
            }
            return token;
        }

        public void addUnit(Unit n)
        {
        }

        public void removeUnit(Unit n)
        {
        }

        public String toString()
        {
            return String.format("Simple %d replicas", replicas);
        }

        public int replicas()
        {
            return replicas;
        }

        public boolean sameGroup(Unit n1, Unit n2)
        {
            return false;
        }

        public Unit getGroup(Unit unit)
        {
            // The unit is the group.
            return unit;
        }

        public double spreadExpectation()
        {
            return 1;
        }
    }

    /**
     * Base for rack/DC-like strategies: units are assigned integer group ids (subclasses decide
     * how) and replicas must come from distinct groups, mirroring NetworkTopologyStrategy.
     */
    static abstract class GroupReplicationStrategy implements TestReplicationStrategy
    {
        final int replicas;
        // Unit -> group id; populated by the subclass' addUnit.
        final Map<Unit, Integer> groupMap;

        public GroupReplicationStrategy(int replicas)
        {
            this.replicas = replicas;
            this.groupMap = Maps.newHashMap();
        }

        public List<Unit> getReplicas(Token token, NavigableMap<Token, Unit> sortedTokens)
        {
            List<Unit> endpoints = new ArrayList<Unit>(replicas);
            BitSet usedGroups = new BitSet();

            if (sortedTokens.isEmpty())
                return endpoints;

            token = sortedTokens.ceilingKey(token);
            if (token == null)
                token = sortedTokens.firstKey();
            // Concatenating the tail map with the full map wraps the walk around the ring.
            Iterator<Unit> iter = Iterables.concat(sortedTokens.tailMap(token, true).values(), sortedTokens.values()).iterator();
            while (endpoints.size() < replicas)
            {
                // For simplicity assuming list can't be exhausted before finding all replicas.
                Unit ep = iter.next();
                int group = groupMap.get(ep);
                // Take only the first unit seen from each group.
                if (!usedGroups.get(group))
                {
                    endpoints.add(ep);
                    usedGroups.set(group);
                }
            }
            return endpoints;
        }

        /**
         * Returns the token of the last (replica'th distinct-group) replica found walking
         * clockwise from the given token.
         */
        public Token lastReplicaToken(Token token, NavigableMap<Token, Unit> sortedTokens)
        {
            BitSet usedGroups = new BitSet();
            int groupsFound = 0;

            token = sortedTokens.ceilingKey(token);
            if (token == null)
                token = sortedTokens.firstKey();
            for (Map.Entry<Token, Unit> en : Iterables.concat(sortedTokens.tailMap(token, true).entrySet(), sortedTokens.entrySet()))
            {
                Unit ep = en.getValue();
                int group = groupMap.get(ep);
                if (!usedGroups.get(group))
                {
                    usedGroups.set(group);
                    if (++groupsFound >= replicas)
                        return en.getKey();
                }
            }
            return token;
        }

        public Token replicationStart(Token token, Unit unit, NavigableMap<Token, Unit> sortedTokens)
        {
            // replicated ownership
            int unitGroup = groupMap.get(unit);   // unit must be already added
            BitSet seenGroups = new BitSet();
            int groupsFound = 0;

            // Walk the ring backwards (wrapping) from the token to find where its replicated span begins.
            for (Map.Entry<Token, Unit> en : Iterables.concat(
                 sortedTokens.headMap(token, false).descendingMap().entrySet(),
                 sortedTokens.descendingMap().entrySet()))
            {
                Unit n = en.getValue();
                int ngroup = groupMap.get(n);
                // Same group as investigated unit is a break; anything that could replicate in it replicates there.
                if (ngroup == unitGroup)
                    break;

                if (!seenGroups.get(ngroup))
                {
                    if (++groupsFound == replicas)
                        break;
                    seenGroups.set(ngroup);
                }
                token = en.getKey();
            }
            return token;
        }

        public String toString()
        {
            // Summarize group sizes as "size -> number of groups of that size" for logging.
            Map<Integer, Integer> idToSize = instanceToCount(groupMap);
            Map<Integer, Integer> sizeToCount = Maps.newTreeMap();
            sizeToCount.putAll(instanceToCount(idToSize));
            return String.format("%s strategy, %d replicas, group size to count %s", getClass().getSimpleName(), replicas, sizeToCount);
        }

        @Override
        public int replicas()
        {
            return replicas;
        }

        public boolean sameGroup(Unit n1, Unit n2)
        {
            return groupMap.get(n1).equals(groupMap.get(n2));
        }

        public void removeUnit(Unit n)
        {
            groupMap.remove(n);
        }

        public Integer getGroup(Unit unit)
        {
            return groupMap.get(unit);
        }

        public double spreadExpectation()
        {
            return 1.5; // Even balanced racks get disbalanced when they lose nodes.
        }
    }

    /**
     * Returns a histogram of the map's values: value -> number of keys mapping to it.
     */
    private static <T> Map<T, Integer> instanceToCount(Map<?, T> map)
    {
        Map<T, Integer> idToCount = Maps.newHashMap();
        for (Map.Entry<?, T> en : map.entrySet())
        {
            Integer old = idToCount.get(en.getValue());
            idToCount.put(en.getValue(), old != null ? old + 1 : 1);
        }
        return idToCount;
    }

    /**
     * Group strategy spreading units into a fixed number of groups.
     */
    static class FixedGroupCountReplicationStrategy extends GroupReplicationStrategy
    {
        int groupId;
        int groupCount;

        public FixedGroupCountReplicationStrategy(int replicas, int groupCount)
        {
            super(replicas);
            assert groupCount >= replicas;
            groupId = 0;
            this.groupCount = groupCount;
        }

        public void addUnit(Unit n)
        {
            // Round-robin assignment over the fixed set of groups.
            groupMap.put(n, groupId++ % groupCount);
        }
    }

    /**
     * Group strategy with a fixed number of units per group.
     */
    static class BalancedGroupReplicationStrategy extends GroupReplicationStrategy
    {
        int groupId;
        int groupSize;

        public BalancedGroupReplicationStrategy(int replicas, int groupSize)
        {
            super(replicas);
            groupId = 0;
            this.groupSize = groupSize;
        }

        public void addUnit(Unit n)
        {
            // Fill each group completely before opening the next.
            groupMap.put(n, groupId++ / groupSize);
        }
    }

    /**
     * Group strategy with randomly sized groups, each between minGroupSize and maxGroupSize units.
     */
    static class UnbalancedGroupReplicationStrategy extends GroupReplicationStrategy
    {
        int groupId;
        int nextSize;
        int num;
        int minGroupSize;
        int maxGroupSize;
        Random rand;

        public UnbalancedGroupReplicationStrategy(int replicas, int minGroupSize, int maxGroupSize, Random rand)
        {
            super(replicas);
            groupId = -1;
            nextSize = 0;
            num = 0;
            this.maxGroupSize = maxGroupSize;
            this.minGroupSize = minGroupSize;
            this.rand = rand;
        }

        public void addUnit(Unit n)
        {
            // Current group full: pick a new random size in [minGroupSize, maxGroupSize] and start the next group.
            if (++num > nextSize)
            {
                nextSize = minGroupSize + rand.nextInt(maxGroupSize - minGroupSize + 1);
                ++groupId;
                num = 0;
            }
            groupMap.put(n, groupId);
        }

        public double spreadExpectation()
        {
            return 2;
        }
    }

    /**
     * Computes, independently of the allocator's own bookkeeping, each unit's total replicated
     * ownership by walking every adjacent token span on the ring and crediting it to its replicas.
     */
    static Map<Unit, Double> evaluateReplicatedOwnership(ReplicationAwareTokenAllocator<Unit> t)
    {
        Map<Unit, Double> ownership = Maps.newHashMap();
        Iterator<Token> it = t.sortedTokens.keySet().iterator();
        if (!it.hasNext())
            return ownership;
        Token current = it.next();
        while (it.hasNext())
        {
            Token next = it.next();
            addOwnership(t, current, next, ownership);
            current = next;
        }
        // Close the ring: the span from the last token wraps around to the first.
        addOwnership(t, current, t.sortedTokens.firstKey(), ownership);

        return ownership;
    }

    /**
     * Credits the size of the span (current, next] to every replica of that span.
     */
    private static void addOwnership(ReplicationAwareTokenAllocator<Unit> t, Token current, Token next, Map<Unit, Double> ownership)
    {
        TestReplicationStrategy ts = (TestReplicationStrategy) t.strategy;
        double size = current.size(next);
        // Any point inside the span has the same replicas; the midpoint is a convenient representative.
        Token representative = t.partitioner.midpoint(current, next);
        for (Unit n : ts.getReplicas(representative, t.sortedTokens))
        {
            Double v = ownership.get(n);
            ownership.put(n, v != null ? v + size : size);
        }
    }

    /**
     * Returns the size of the contiguous span replicated by the given token, i.e. from its
     * replication start to its successor on the ring.
     */
    private static double replicatedTokenOwnership(Token token, NavigableMap<Token, Unit> sortedTokens, ReplicationStrategy<Unit> strategy)
    {
        TestReplicationStrategy ts = (TestReplicationStrategy) strategy;
        Token next = sortedTokens.higherKey(token);
        if (next == null)
            next = sortedTokens.firstKey();
        return ts.replicationStart(token, sortedTokens.get(token), sortedTokens).size(next);
    }

    /**
     * Policy for how many tokens each newly added unit gets, plus the disbalance tolerance that
     * goes with it.
     */
    static interface TokenCount
    {
        int tokenCount(int perUnitCount, Random rand);

        double spreadExpectation();
    }

    // Every unit gets exactly perUnitCount tokens.
    static TokenCount fixedTokenCount = new TokenCount()
    {
        public int tokenCount(int perUnitCount, Random rand)
        {
            return perUnitCount;
        }

        public double spreadExpectation()
        {
            return 4;  // High tolerance to avoid flakiness.
        }
    };

    // Units get a random token count around perUnitCount, so heterogeneous clusters are covered.
    static TokenCount varyingTokenCount = new TokenCount()
    {
        public int tokenCount(int perUnitCount, Random rand)
        {
            if (perUnitCount == 1) return 1;
            // 25 to 175%
            return rand.nextInt(perUnitCount * 3 / 2) + (perUnitCount + 3) / 4;
        }

        public double spreadExpectation()
        {
            return 8;  // High tolerance to avoid flakiness.
        }
    };

    Murmur3Partitioner partitioner = new Murmur3Partitioner();
    // Fixed seed keeps the randomized scenarios reproducible across runs.
    Random seededRand = new Random(2);

    /**
     * Populates the token map with unitCount fresh units, each holding tc-chosen random tokens,
     * registering every unit with the replication strategy.
     */
    private void random(Map<Token, Unit> map, TestReplicationStrategy rs, int unitCount, TokenCount tc, int perUnitCount)
    {
        System.out.format("\nRandom generation of %d units with %d tokens each\n", unitCount, perUnitCount);
        Random rand = seededRand;
        for (int i = 0; i < unitCount; i++)
        {
            Unit unit = new Unit();
            rs.addUnit(unit);
            int tokens = tc.tokenCount(perUnitCount, rand);
            for (int j = 0; j < tokens; j++)
            {
                map.put(partitioner.getRandomToken(rand), unit);
            }
        }
    }

    @Test
    public void testExistingCluster()
    {
        // Sweep replication factor and vnode count across all strategy flavours.
        for (int rf = 1; rf <= 5; ++rf)
        {
            for (int perUnitCount = 1; perUnitCount <= MAX_VNODE_COUNT; perUnitCount *= 4)
            {
                testExistingCluster(perUnitCount, fixedTokenCount, new SimpleReplicationStrategy(rf));
                testExistingCluster(perUnitCount, varyingTokenCount, new SimpleReplicationStrategy(rf));
                if (rf == 1) continue;  // Replication strategy doesn't matter for RF = 1.
                for (int groupSize = 4; groupSize <= 64 && groupSize * rf * 4 < TARGET_CLUSTER_SIZE; groupSize *= 4)
                {
                    testExistingCluster(perUnitCount, fixedTokenCount, new BalancedGroupReplicationStrategy(rf, groupSize));
                    testExistingCluster(perUnitCount, varyingTokenCount, new UnbalancedGroupReplicationStrategy(rf, groupSize / 2, groupSize * 2, seededRand));
                }
                testExistingCluster(perUnitCount, fixedTokenCount, new FixedGroupCountReplicationStrategy(rf, rf * 2));
            }
        }
    }

    /**
     * Scenario: start from a half-size randomly laid-out cluster, grow it to target size
     * (verifying metrics over the last stretch), then lose and replace a tenth of the units.
     */
    public void testExistingCluster(int perUnitCount, TokenCount tc, TestReplicationStrategy rs)
    {
        System.out.println("Testing existing cluster, target " + perUnitCount + " vnodes, replication " + rs);
        final int targetClusterSize = TARGET_CLUSTER_SIZE;
        NavigableMap<Token, Unit> tokenMap = Maps.newTreeMap();

        random(tokenMap, rs, targetClusterSize / 2, tc, perUnitCount);

        ReplicationAwareTokenAllocator<Unit> t = new ReplicationAwareTokenAllocator<>(tokenMap, rs, partitioner);
        // Give the allocator room to smooth out the random layout before verifying.
        grow(t, targetClusterSize * 9 / 10, tc, perUnitCount, false);
        grow(t, targetClusterSize, tc, perUnitCount, true);
        loseAndReplace(t, targetClusterSize / 10, tc, perUnitCount);
        System.out.println();
    }

    @Test
    public void testNewCluster()
    {
        Util.flakyTest(this::flakyTestNewCluster,
                       5,
                       "It tends to fail sometimes due to the random selection of the tokens in the first few nodes.");
    }

    public void flakyTestNewCluster()
    {
        // This test is flaky because the selection of the tokens for the first RF nodes (which is random, with an
        // uncontrolled seed) can sometimes cause a pathological situation where the algorithm will find a (close to)
        // ideal distribution of tokens for some number of nodes, which in turn will inevitably cause it to go into a
        // bad (unacceptable to the test criteria) distribution after adding one more node.
        // This should happen very rarely, unless something is broken in the token allocation code.
        for (int rf = 2; rf <= 5; ++rf)
        {
            for (int perUnitCount = 1; perUnitCount <= MAX_VNODE_COUNT; perUnitCount *= 4)
            {
                testNewCluster(perUnitCount, fixedTokenCount, new SimpleReplicationStrategy(rf));
                testNewCluster(perUnitCount, varyingTokenCount, new SimpleReplicationStrategy(rf));
                if (rf == 1) continue;  // Replication strategy doesn't matter for RF = 1.
                for (int groupSize = 4; groupSize <= 64 && groupSize * rf * 8 < TARGET_CLUSTER_SIZE; groupSize *= 4)
                {
                    testNewCluster(perUnitCount, fixedTokenCount, new BalancedGroupReplicationStrategy(rf, groupSize));
                    testNewCluster(perUnitCount, varyingTokenCount, new UnbalancedGroupReplicationStrategy(rf, groupSize / 2, groupSize * 2, seededRand));
                }
                testNewCluster(perUnitCount, fixedTokenCount, new FixedGroupCountReplicationStrategy(rf, rf * 2));
            }
        }
    }

    /**
     * Scenario: build a cluster entirely through the allocator, grow it to target size
     * (verifying metrics over the last stretch), then lose and replace a fifth of the units.
     */
    public void testNewCluster(int perUnitCount, TokenCount tc, TestReplicationStrategy rs)
    {
        System.out.println("Testing new cluster, target " + perUnitCount + " vnodes, replication " + rs);
        final int targetClusterSize = TARGET_CLUSTER_SIZE;
        NavigableMap<Token, Unit> tokenMap = Maps.newTreeMap();
        ReplicationAwareTokenAllocator<Unit> t = new ReplicationAwareTokenAllocator<>(tokenMap, rs, partitioner);
        grow(t, targetClusterSize * 2 / 5, tc, perUnitCount, false);
        grow(t, targetClusterSize, tc, perUnitCount, true);
        loseAndReplace(t, targetClusterSize / 5, tc, perUnitCount);
        System.out.println();
    }

    /**
     * Removes howMany randomly chosen units from the allocator (and strategy), then grows back to
     * the original size, verifying metrics only over the second half of the regrowth.
     */
    private void loseAndReplace(ReplicationAwareTokenAllocator<Unit> t, int howMany, TokenCount tc, int perUnitCount)
    {
        int fullCount = t.unitCount();
        System.out.format("Losing %d units. ", howMany);
        for (int i = 0; i < howMany; ++i)
        {
            Unit u = t.unitFor(partitioner.getRandomToken(seededRand));
            t.removeUnit(u);
            ((TestReplicationStrategy) t.strategy).removeUnit(u);
        }
        // Grow half without verifying.
        grow(t, (t.unitCount() + fullCount * 3) / 4, tc, perUnitCount, false);
        // Metrics should be back to normal by now. Check that they remain so.
        grow(t, fullCount, tc, perUnitCount, true);
    }

    /**
     * Tracks the worst (most extreme) min/max/stddev seen across a sequence of statistics
     * snapshots taken while the cluster grows.
     */
    static class Summary
    {
        double min = 1;
        double max = 1;
        double stddev = 0;

        void update(SummaryStatistics stat)
        {
            min = Math.min(min, stat.getMin());
            max = Math.max(max, stat.getMax());
            stddev = Math.max(stddev, stat.getStandardDeviation());
        }

        public String toString()
        {
            return String.format("max %.2f min %.2f stddev %.4f", max, min, stddev);
        }
    }

    /**
     * Grows the cluster to targetClusterSize by adding allocator-placed units; when verifyMetrics
     * is set, records ownership statistics after every addition and asserts the worst per-unit
     * overload stays under the combined spread expectation of the token-count policy and strategy.
     */
    public void grow(ReplicationAwareTokenAllocator<Unit> t, int targetClusterSize, TokenCount tc, int perUnitCount, boolean verifyMetrics)
    {
        int size = t.unitCount();
        Summary su = new Summary();
        Summary st = new Summary();
        // Seed derived from the scenario parameters keeps each configuration reproducible.
        Random rand = new Random(targetClusterSize + perUnitCount);
        TestReplicationStrategy strategy = (TestReplicationStrategy) t.strategy;
        if (size < targetClusterSize)
        {
            System.out.format("Adding %d unit(s) using %s...", targetClusterSize - size, t.toString());
            long time = System.currentTimeMillis();
            while (size < targetClusterSize)
            {
                int tokens = tc.tokenCount(perUnitCount, rand);
                Unit unit = new Unit();
                strategy.addUnit(unit);
                t.addUnit(unit, tokens);
                ++size;
                if (verifyMetrics)
                    updateSummary(t, su, st, false);
            }
            System.out.format(" Done in %.3fs\n", (System.currentTimeMillis() - time) / 1000.0);

            if (verifyMetrics)
            {
                updateSummary(t, su, st, true);
                double maxExpected = 1.0 + tc.spreadExpectation() * strategy.spreadExpectation() / (perUnitCount * t.replicas);
                if (su.max > maxExpected)
                {
                    Assert.fail(String.format("Expected max unit size below %.4f, was %.4f", maxExpected, su.max));
                }
                // We can't verify lower side range as small loads can't always be fixed.
            }
        }
    }

    /**
     * Samples normalized per-unit and per-token replicated ownership (1.0 == perfectly balanced)
     * into the running summaries; optionally prints the current and worst-seen figures.
     */
    private void updateSummary(ReplicationAwareTokenAllocator<Unit> t, Summary su, Summary st, boolean print)
    {
        int size = t.sortedTokens.size();
        // Scale so that a perfectly even distribution yields values of 1.0.
        double inverseAverage = 1.0 * size / t.strategy.replicas();

        Map<Unit, Double> ownership = evaluateReplicatedOwnership(t);
        SummaryStatistics unitStat = new SummaryStatistics();
        for (Map.Entry<Unit, Double> en : ownership.entrySet())
            unitStat.addValue(en.getValue() * inverseAverage / t.unitToTokens.get(en.getKey()).size());
        su.update(unitStat);

        SummaryStatistics tokenStat = new SummaryStatistics();
        for (Token tok : t.sortedTokens.keySet())
            tokenStat.addValue(replicatedTokenOwnership(tok, t.sortedTokens, t.strategy) * inverseAverage);
        st.update(tokenStat);

        if (print)
        {
            System.out.format("Size %d(%d) \tunit %s token %s %s\n",
                              t.unitCount(), size,
                              mms(unitStat),
                              mms(tokenStat),
                              t.strategy);
            System.out.format("Worst intermediate unit\t%s token %s\n", su, st);
        }
    }

    // Formats a statistics snapshot as "max ... min ... stddev ...".
    private static String mms(SummaryStatistics s)
    {
        return String.format("max %.2f min %.2f stddev %.4f", s.getMax(), s.getMin(), s.getStandardDeviation());
    }

    // Monotonically increasing id source for Unit instances (test runs single-threaded).
    int nextUnitId = 0;

    /**
     * Minimal stand-in for a cluster node: identity-based, ordered by creation sequence.
     */
    final class Unit implements Comparable<Unit>
    {
        int unitId = nextUnitId++;

        public String toString()
        {
            return Integer.toString(unitId);
        }

        @Override
        public int compareTo(Unit o)
        {
            return Integer.compare(unitId, o.unitId);
        }
    }
}