/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.kafka.clients.consumer;

import java.io.Serializable;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeSet;

import org.apache.kafka.clients.consumer.internals.AbstractPartitionAssignor;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.common.protocol.types.ArrayOf;
import org.apache.kafka.common.protocol.types.Field;
import org.apache.kafka.common.protocol.types.Schema;
import org.apache.kafka.common.protocol.types.Struct;
import org.apache.kafka.common.protocol.types.Type;
import org.apache.kafka.common.utils.CollectionUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * The sticky assignor serves two purposes. First, it guarantees an assignment that is as balanced as possible, meaning either:
 * - the numbers of topic partitions assigned to consumers differ by at most one; or
 * - each consumer that has 2+ fewer topic partitions than some other consumer cannot get any of those topic partitions transferred to it.
 * Second, it preserves as many existing assignments as possible when a reassignment occurs. This helps save some of the
 * processing overhead when topic partitions move from one consumer to another.
 *
 * Starting fresh, it works by distributing the partitions over consumers as evenly as possible. Even though this may sound similar to
 * how the round robin assignor works, the second example below shows that it is not.
 * During a reassignment it performs the reassignment in such a way that in the new assignment
 * 1. topic partitions are still distributed as evenly as possible, and
 * 2. topic partitions stay with their previously assigned consumers as much as possible.
 * Of course, the first goal above takes precedence over the second one.
 *
 * <b>Example 1.</b> Suppose there are three consumers <code>C0</code>, <code>C1</code>, <code>C2</code>,
 * four topics <code>t0</code>, <code>t1</code>, <code>t2</code>, <code>t3</code>, and each topic has 2 partitions,
 * resulting in partitions <code>t0p0</code>, <code>t0p1</code>, <code>t1p0</code>, <code>t1p1</code>, <code>t2p0</code>,
 * <code>t2p1</code>, <code>t3p0</code>, <code>t3p1</code>. Each consumer is subscribed to all four topics.
 *
 * The assignment with both sticky and round robin assignors will be:
 * <ul>
 * <li><code>C0: [t0p0, t1p1, t3p0]</code></li>
 * <li><code>C1: [t0p1, t2p0, t3p1]</code></li>
 * <li><code>C2: [t1p0, t2p1]</code></li>
 * </ul>
 *
 * Now, let's assume <code>C1</code> is removed and a reassignment is about to happen. The round robin assignor would produce:
 * <ul>
 * <li><code>C0: [t0p0, t1p0, t2p0, t3p0]</code></li>
 * <li><code>C2: [t0p1, t1p1, t2p1, t3p1]</code></li>
 * </ul>
 *
 * while the sticky assignor would result in:
 * <ul>
 * <li><code>C0 [t0p0, t1p1, t3p0, t2p0]</code></li>
 * <li><code>C2 [t1p0, t2p1, t0p1, t3p1]</code></li>
 * </ul>
 * preserving all the previous assignments (unlike the round robin assignor).
 *
 * <b>Example 2.</b> There are three consumers <code>C0</code>, <code>C1</code>, <code>C2</code>,
 * and three topics <code>t0</code>, <code>t1</code>, <code>t2</code>, with 1, 2, and 3 partitions respectively.
 * Therefore, the partitions are <code>t0p0</code>, <code>t1p0</code>, <code>t1p1</code>, <code>t2p0</code>,
 * <code>t2p1</code>, <code>t2p2</code>. <code>C0</code> is subscribed to <code>t0</code>; <code>C1</code> is subscribed to
 * <code>t0</code>, <code>t1</code>; and <code>C2</code> is subscribed to <code>t0</code>, <code>t1</code>, <code>t2</code>.
 *
 * The round robin assignor would come up with the following assignment:
 * <ul>
 * <li><code>C0 [t0p0]</code></li>
 * <li><code>C1 [t1p0]</code></li>
 * <li><code>C2 [t1p1, t2p0, t2p1, t2p2]</code></li>
 * </ul>
 *
 * which is not as balanced as the assignment suggested by the sticky assignor:
 * <ul>
 * <li><code>C0 [t0p0]</code></li>
 * <li><code>C1 [t1p0, t1p1]</code></li>
 * <li><code>C2 [t2p0, t2p1, t2p2]</code></li>
 * </ul>
 *
 * Now, if consumer <code>C0</code> is removed, these two assignors would produce the following assignments.
 * Round Robin (preserves 3 partition assignments):
 * <ul>
 * <li><code>C1 [t0p0, t1p1]</code></li>
 * <li><code>C2 [t1p0, t2p0, t2p1, t2p2]</code></li>
 * </ul>
 *
 * Sticky (preserves 5 partition assignments):
 * <ul>
 * <li><code>C1 [t1p0, t1p1, t0p0]</code></li>
 * <li><code>C2 [t2p0, t2p1, t2p2]</code></li>
 * </ul>
 *
 * <h3>Impact on <code>ConsumerRebalanceListener</code></h3>
 * The sticky assignment strategy can provide some optimization to those consumers that have some partition cleanup code
 * in their <code>onPartitionsRevoked()</code> callback listeners. The cleanup code is placed in that callback listener
 * because the consumer has no assumption or hope of preserving any of its assigned partitions after a rebalance when it
 * is using the range or round robin assignor. The listener code would look like this:
 * <code>
 * class TheOldRebalanceListener implements ConsumerRebalanceListener {
 *
 *     void onPartitionsRevoked(Collection<TopicPartition> partitions) {
 *         for (TopicPartition partition: partitions) {
 *             commitOffsets(partition);
 *             cleanupState(partition);
 *         }
 *     }
 *
 *     void onPartitionsAssigned(Collection<TopicPartition> partitions) {
 *         for (TopicPartition partition: partitions) {
 *             initializeState(partition);
 *             initializeOffset(partition);
 *         }
 *     }
 * }
 * </code>
 *
 * As mentioned above, one advantage of the sticky assignor is that, in general, it reduces the number of partitions that
 * actually move from one consumer to another during a reassignment. Therefore, it allows consumers to do their cleanup
 * more efficiently.
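 * <p>
 * As an illustrative sketch only (not part of the original documentation), a consumer opts into this assignor through the
 * standard <code>partition.assignment.strategy</code> consumer configuration. The configuration keys used below are the
 * regular consumer configuration keys, while the bootstrap servers, group id, and topic name are placeholders:
 * <code>
 * Properties props = new Properties();
 * props.put("bootstrap.servers", "localhost:9092");              // placeholder broker address
 * props.put("group.id", "example-group");                        // placeholder consumer group
 * props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
 * props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
 * props.put("partition.assignment.strategy", StickyAssignor.class.getName());
 * KafkaConsumer<String, String> consumer = new KafkaConsumer<>(props);
 * consumer.subscribe(Collections.singletonList("example-topic")); // placeholder topic
 * </code>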
 * Of course, consumers can still perform the partition cleanup in the <code>onPartitionsRevoked()</code>
 * listener, but they can be more efficient and make a note of their partitions before and after the rebalance, and do the
 * cleanup after the rebalance only on the partitions they have lost (which is normally not a lot). The code snippet below
 * clarifies this point:
 * <code>
 * class TheNewRebalanceListener implements ConsumerRebalanceListener {
 *     Collection<TopicPartition> lastAssignment = Collections.emptyList();
 *
 *     void onPartitionsRevoked(Collection<TopicPartition> partitions) {
 *         for (TopicPartition partition: partitions)
 *             commitOffsets(partition);
 *     }
 *
 *     void onPartitionsAssigned(Collection<TopicPartition> assignment) {
 *         for (TopicPartition partition: difference(lastAssignment, assignment))
 *             cleanupState(partition);
 *
 *         for (TopicPartition partition: difference(assignment, lastAssignment))
 *             initializeState(partition);
 *
 *         for (TopicPartition partition: assignment)
 *             initializeOffset(partition);
 *
 *         this.lastAssignment = assignment;
 *     }
 * }
 * </code>
 *
 * Any consumer that uses sticky assignment can leverage this listener like this:
 * <code>consumer.subscribe(topics, new TheNewRebalanceListener());</code>
 *
 */
public class StickyAssignor extends AbstractPartitionAssignor {

    private static final Logger log = LoggerFactory.getLogger(StickyAssignor.class);

    // these schemas are used for preserving the consumer's previously assigned partitions
    // list and sending it as user data to the leader during a rebalance
    private static final String TOPIC_PARTITIONS_KEY_NAME = "previous_assignment";
    private static final String TOPIC_KEY_NAME = "topic";
    private static final String PARTITIONS_KEY_NAME = "partitions";
    private static final Schema TOPIC_ASSIGNMENT = new Schema(
            new Field(TOPIC_KEY_NAME, Type.STRING),
            new Field(PARTITIONS_KEY_NAME, new ArrayOf(Type.INT32)));
    private static final Schema STICKY_ASSIGNOR_USER_DATA = new Schema(
            new Field(TOPIC_PARTITIONS_KEY_NAME, new ArrayOf(TOPIC_ASSIGNMENT)));

    Map<String, List<TopicPartition>> currentAssignment = new HashMap<>();
    private List<TopicPartition> memberAssignment = null;
    private PartitionMovements partitionMovements;

    public Map<String, List<TopicPartition>> assign(Map<String, Integer> partitionsPerTopic,
                                                    Map<String, List<String>> subscriptions) {
        partitionMovements = new PartitionMovements();

        prepopulateCurrentAssignments();

        // make a deep copy of currentAssignment
        Map<String, List<TopicPartition>> oldAssignment = deepCopy(currentAssignment);

        // a mapping of all topic partitions to all consumers that can be assigned to them
        final HashMap<TopicPartition, List<String>> partition2AllPotentialConsumers = new HashMap<>();
        // a mapping of all consumers to all potential topic partitions that can be assigned to them
        final HashMap<String, List<TopicPartition>> consumer2AllPotentialPartitions = new HashMap<>();

        // initialize partition2AllPotentialConsumers and consumer2AllPotentialPartitions in the following two for loops
        for (Entry<String, Integer> entry: partitionsPerTopic.entrySet()) {
            for (int i = 0; i < entry.getValue(); ++i)
                partition2AllPotentialConsumers.put(new TopicPartition(entry.getKey(), i), new ArrayList<String>());
        }

        for (Entry<String, List<String>> entry: subscriptions.entrySet()) {
            String consumer = entry.getKey();
            consumer2AllPotentialPartitions.put(consumer, new ArrayList<TopicPartition>());
            for (String topic: entry.getValue()) {
                for (int i = 0; i < partitionsPerTopic.get(topic); ++i) {
                    TopicPartition topicPartition = new
TopicPartition(topic, i); consumer2AllPotentialPartitions.get(consumer).add(topicPartition); partition2AllPotentialConsumers.get(topicPartition).add(consumer); } } // add this consumer to currentAssignment (with an empty topic partition assignment) if it does not already exist if (!currentAssignment.containsKey(consumer)) currentAssignment.put(consumer, new ArrayList<TopicPartition>()); } // a mapping of partition to current consumer HashMap<TopicPartition, String> currentPartitionConsumer = new HashMap<>(); for (Map.Entry<String, List<TopicPartition>> entry: currentAssignment.entrySet()) for (TopicPartition topicPartition: entry.getValue()) currentPartitionConsumer.put(topicPartition, entry.getKey()); List<TopicPartition> sortedPartitions = sortPartitions(oldAssignment.isEmpty(), partition2AllPotentialConsumers, consumer2AllPotentialPartitions); // all partitions that need to be assigned (initially set to all partitions but adjusted in the following loop) List<TopicPartition> unassignedPartitions = new ArrayList<>(sortedPartitions); for (Iterator<Map.Entry<String, List<TopicPartition>>> it = currentAssignment.entrySet().iterator(); it.hasNext();) { Map.Entry<String, List<TopicPartition>> entry = it.next(); if (!subscriptions.containsKey(entry.getKey())) { // if a consumer that existed before (and had some partition assignments) is now removed, remove it from currentAssignment for (TopicPartition topicPartition: entry.getValue()) currentPartitionConsumer.remove(topicPartition); it.remove(); } else { // otherwise (the consumer still exists) for (Iterator<TopicPartition> partitionIter = entry.getValue().iterator(); partitionIter.hasNext();) { TopicPartition partition = partitionIter.next(); if (!partition2AllPotentialConsumers.containsKey(partition)) { // if this topic partition of this consumer no longer exists remove it from currentAssignment of the consumer partitionIter.remove(); currentPartitionConsumer.remove(partition); } else if (!subscriptions.get(entry.getKey()).contains(partition.topic())) { // if this partition cannot remain assigned to its current consumer because the consumer // is no longer subscribed to its topic remove it from currentAssignment of the consumer partitionIter.remove(); } else // otherwise, remove the topic partition from those that need to be assigned only if // its current consumer is still subscribed to its topic (because it is already assigned // and we would want to preserve that assignment as much as possible) unassignedPartitions.remove(partition); } } } // at this point we have preserved all valid topic partition to consumer assignments and removed // all invalid topic partitions and invalid consumers. Now we need to assign unassignedPartitions // to consumers so that the topic partition assignments are as balanced as possible. 
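        // In outline, the balancing phase below keeps consumers in a TreeSet ordered by the size of their current
        // assignment (ties broken by consumer id), hands each unassigned partition to the least-loaded consumer that
        // is subscribed to its topic, and then repeatedly moves partitions away from consumers that own at least two
        // more partitions than another potential consumer of those partitions, until the assignment is balanced.
        // The PartitionMovements bookkeeping keeps those moves as sticky as possible.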
// an ascending sorted set of consumers based on how many topic partitions are already assigned to them TreeSet<String> sortedCurrentSubscriptions = new TreeSet<>(new SubscriptionComparator(currentAssignment)); sortedCurrentSubscriptions.addAll(currentAssignment.keySet()); balance(sortedPartitions, unassignedPartitions, sortedCurrentSubscriptions, consumer2AllPotentialPartitions, partition2AllPotentialConsumers, oldAssignment, currentPartitionConsumer); return currentAssignment; } private void prepopulateCurrentAssignments() { Map<String, Subscription> subscriptions = getSubscriptions(); if (subscriptions == null) return; currentAssignment.clear(); for (Map.Entry<String, Subscription> subscriptionEntry : subscriptions.entrySet()) { ByteBuffer userData = subscriptionEntry.getValue().userData(); if (userData != null && userData.hasRemaining()) currentAssignment.put(subscriptionEntry.getKey(), deserializeTopicPartitionAssignment(userData)); } } @Override public void onAssignment(Assignment assignment) { memberAssignment = assignment.partitions(); } @Override public Subscription subscription(Set<String> topics) { if (memberAssignment == null) return new Subscription(new ArrayList<>(topics)); return new Subscription(new ArrayList<>(topics), serializeTopicPartitionAssignment(memberAssignment)); } @Override public String name() { return "sticky"; } /** * determine if the current assignment is a balanced one * * @param sortedCurrentSubscriptions: an ascending sorted set of consumers based on how many topic partitions are already assigned to them * @param allSubscriptions: a mapping of all consumers to all potential topic partitions that can be assigned to them * @return */ private boolean isBalanced(TreeSet<String> sortedCurrentSubscriptions, Map<String, List<TopicPartition>> allSubscriptions) { int min = currentAssignment.get(sortedCurrentSubscriptions.first()).size(); int max = currentAssignment.get(sortedCurrentSubscriptions.last()).size(); if (min >= max - 1) // if minimum and maximum numbers of partitions assigned to consumers differ by at most one return true return true; // create a mapping from partitions to the consumer assigned to them final HashMap<TopicPartition, String> allPartitions = new HashMap<>(); Set<Entry<String, List<TopicPartition>>> assignments = currentAssignment.entrySet(); for (Map.Entry<String, List<TopicPartition>> entry: assignments) { List<TopicPartition> topicPartitions = entry.getValue(); for (TopicPartition topicPartition: topicPartitions) { if (allPartitions.containsKey(topicPartition)) log.error(topicPartition + " is assigned to more than one consumer."); allPartitions.put(topicPartition, entry.getKey()); } } // for each consumer that does not have all the topic partitions it can get make sure none of the topic partitions it // could but did not get cannot be moved to it (because that would break the balance) for (String consumer: sortedCurrentSubscriptions) { List<TopicPartition> consumerPartitions = currentAssignment.get(consumer); int consumerPartitionCount = consumerPartitions.size(); // skip if this consumer already has all the topic partitions it can get if (consumerPartitionCount == allSubscriptions.get(consumer).size()) continue; // otherwise make sure it cannot get any more List<TopicPartition> potentialTopicPartitions = allSubscriptions.get(consumer); for (TopicPartition topicPartition: potentialTopicPartitions) { if (!currentAssignment.get(consumer).contains(topicPartition)) { String otherConsumer = allPartitions.get(topicPartition); int 
otherConsumerPartitionCount = currentAssignment.get(otherConsumer).size(); if (consumerPartitionCount < otherConsumerPartitionCount) { log.debug(topicPartition + " can be moved from consumer " + otherConsumer + " to consumer " + consumer + " for a more balanced assignment."); return false; } } } } return true; } /** * @return the balance score of the given assignment, as the sum of assigned partitions size difference of all consumer pairs. * A perfectly balanced assignment (with all consumers getting the same number of partitions) has a balance score of 0. * Lower balance score indicates a more balanced assignment. */ private int getBalanceScore(Map<String, List<TopicPartition>> assignment) { int score = 0; Map<String, Integer> consumer2AssignmentSize = new HashMap<>(); for (Entry<String, List<TopicPartition>> entry: assignment.entrySet()) consumer2AssignmentSize.put(entry.getKey(), entry.getValue().size()); Iterator<Entry<String, Integer>> it = consumer2AssignmentSize.entrySet().iterator(); while (it.hasNext()) { Entry<String, Integer> entry = it.next(); int consumerAssignmentSize = entry.getValue(); it.remove(); for (Entry<String, Integer> otherEntry: consumer2AssignmentSize.entrySet()) score += Math.abs(consumerAssignmentSize - otherEntry.getValue()); } return score; } /** * Sort valid partitions so they are processed in the potential reassignment phase in the proper order * that causes minimal partition movement among consumers (hence honoring maximal stickiness) * * @param isFreshAssignment whether this is a new assignment, or a reassignment of an existing one * @param partition2AllPotentialConsumers a mapping of partitions to their potential consumers * @param consumer2AllPotentialPartitions a mapping of consumers to potential partitions they can consumer from * @return sorted list of valid partitions */ private List<TopicPartition> sortPartitions(boolean isFreshAssignment, HashMap<TopicPartition, List<String>> partition2AllPotentialConsumers, HashMap<String, List<TopicPartition>> consumer2AllPotentialPartitions) { List<TopicPartition> sortedPartitions = new ArrayList<>(); if (!isFreshAssignment && areSubscriptionsIdentical(partition2AllPotentialConsumers, consumer2AllPotentialPartitions)) { // if this is a reassignment and the subscriptions are identical (all consumers can consumer from all topics) // then we just need to simply list partitions in a round robin fashion (from consumers with // most assigned partitions to those with least) Map<String, List<TopicPartition>> assignments = deepCopy(currentAssignment); for (Entry<String, List<TopicPartition>> entry: assignments.entrySet()) { List<TopicPartition> toRemove = new ArrayList<>(); for (TopicPartition partition: entry.getValue()) if (!partition2AllPotentialConsumers.keySet().contains(partition)) toRemove.add(partition); for (TopicPartition partition: toRemove) entry.getValue().remove(partition); } TreeSet<String> sortedConsumers = new TreeSet<>(new SubscriptionComparator(assignments)); sortedConsumers.addAll(assignments.keySet()); while (!sortedConsumers.isEmpty()) { String consumer = sortedConsumers.pollLast(); List<TopicPartition> remainingPartitions = assignments.get(consumer); if (!remainingPartitions.isEmpty()) { sortedPartitions.add(remainingPartitions.remove(0)); sortedConsumers.add(consumer); } } for (TopicPartition partition: partition2AllPotentialConsumers.keySet()) { if (!sortedPartitions.contains(partition)) sortedPartitions.add(partition); } } else { // an ascending sorted set of topic partitions based on how many 
consumers can potentially use them TreeSet<TopicPartition> sortedAllPartitions = new TreeSet<>(new PartitionComparator(partition2AllPotentialConsumers)); sortedAllPartitions.addAll(partition2AllPotentialConsumers.keySet()); while (!sortedAllPartitions.isEmpty()) sortedPartitions.add(sortedAllPartitions.pollFirst()); } return sortedPartitions; } /** * @param partition2AllPotentialConsumers a mapping of partitions to their potential consumers * @param consumer2AllPotentialPartitions a mapping of consumers to potential partitions they can consumer from * @return true if potential consumers of partitions are the same, and potential partitions consumers can * consumer from are the same too */ private boolean areSubscriptionsIdentical(HashMap<TopicPartition, List<String>> partition2AllPotentialConsumers, HashMap<String, List<TopicPartition>> consumer2AllPotentialPartitions) { if (!hasIdenticalListElements(partition2AllPotentialConsumers.values())) return false; if (!hasIdenticalListElements(consumer2AllPotentialPartitions.values())) return false; return true; } /** * @param col a collection of elements of type list * @return true if all lists in the collection have the same members; false otherwise */ private <T> boolean hasIdenticalListElements(Collection<List<T>> col) { Iterator<List<T>> it = col.iterator(); List<T> cur = it.next(); while (it.hasNext()) { List<T> next = it.next(); if (!(cur.containsAll(next) && next.containsAll(cur))) return false; cur = next; } return true; } /** * @return the consumer to which the given partition is assigned. The assignment should improve the overall balance * of the partition assignments to consumers. */ private String assignPartition(TopicPartition partition, TreeSet<String> sortedCurrentSubscriptions, HashMap<String, List<TopicPartition>> consumer2AllPotentialPartitions, HashMap<TopicPartition, String> currentPartitionConsumer) { for (String consumer: sortedCurrentSubscriptions) { if (consumer2AllPotentialPartitions.get(consumer).contains(partition)) { sortedCurrentSubscriptions.remove(consumer); currentAssignment.get(consumer).add(partition); currentPartitionConsumer.put(partition, consumer); sortedCurrentSubscriptions.add(consumer); return consumer; } } return null; } private boolean canParticipateInReassignment(TopicPartition partition, HashMap<TopicPartition, List<String>> partition2AllPotentialConsumers) { // if a partition has two or more potential consumers it is subject to reassignment. 
return partition2AllPotentialConsumers.get(partition).size() >= 2; } private boolean canParticipateInReassignment(String consumer, HashMap<String, List<TopicPartition>> consumer2AllPotentialPartitions, HashMap<TopicPartition, List<String>> partition2AllPotentialConsumers) { List<TopicPartition> currentPartitions = currentAssignment.get(consumer); int currentAssignmentSize = currentPartitions.size(); int maxAssignmentSize = consumer2AllPotentialPartitions.get(consumer).size(); if (currentAssignmentSize > maxAssignmentSize) log.error("The consumer " + consumer + " is assigned more partitions than the maximum possible."); if (currentAssignmentSize < maxAssignmentSize) // if a consumer is not assigned all its potential partitions it is subject to reassignment return true; for (TopicPartition partition: currentPartitions) // if any of the partitions assigned to a consumer is subject to reassignment the consumer itself // is subject to reassignment if (canParticipateInReassignment(partition, partition2AllPotentialConsumers)) return true; return false; } /** * Balance the current assignment using the data structures created in the assign(...) method above. */ private void balance(List<TopicPartition> sortedPartitions, List<TopicPartition> unassignedPartitions, TreeSet<String> sortedCurrentSubscriptions, HashMap<String, List<TopicPartition>> consumer2AllPotentialPartitions, HashMap<TopicPartition, List<String>> partition2AllPotentialConsumers, Map<String, List<TopicPartition>> oldAssignment, HashMap<TopicPartition, String> currentPartitionConsumer) { boolean initializing = currentAssignment.get(sortedCurrentSubscriptions.last()).isEmpty(); boolean reassignmentPerformed = false; // assign all unassigned partitions for (TopicPartition partition: unassignedPartitions) { // skip if there is no potential consumer for the partition if (partition2AllPotentialConsumers.get(partition).isEmpty()) continue; assignPartition(partition, sortedCurrentSubscriptions, consumer2AllPotentialPartitions, currentPartitionConsumer); } // narrow down the reassignment scope to only those partitions that can actually be reassigned Set<TopicPartition> fixedPartitions = new HashSet<>(); for (TopicPartition partition: partition2AllPotentialConsumers.keySet()) if (!canParticipateInReassignment(partition, partition2AllPotentialConsumers)) fixedPartitions.add(partition); sortedPartitions.removeAll(fixedPartitions); // narrow down the reassignment scope to only those consumers that are subject to reassignment Map<String, List<TopicPartition>> fixedAssignments = new HashMap<>(); for (String consumer: consumer2AllPotentialPartitions.keySet()) if (!canParticipateInReassignment(consumer, consumer2AllPotentialPartitions, partition2AllPotentialConsumers)) { sortedCurrentSubscriptions.remove(consumer); fixedAssignments.put(consumer, currentAssignment.remove(consumer)); } // create a deep copy of the current assignment so we can revert to it if we do not get a more balanced assignment later Map<String, List<TopicPartition>> preBalanceAssignment = deepCopy(currentAssignment); HashMap<TopicPartition, String> preBalancePartitionConsumers = new HashMap<>(currentPartitionConsumer); reassignmentPerformed = performReassignments(sortedPartitions, sortedCurrentSubscriptions, consumer2AllPotentialPartitions, partition2AllPotentialConsumers, currentPartitionConsumer); // if we are not preserving existing assignments and we have made changes to the current assignment // make sure we are getting a more balanced assignment; otherwise, revert to 
previous assignment if (!initializing && reassignmentPerformed && getBalanceScore(currentAssignment) >= getBalanceScore(preBalanceAssignment)) { deepCopy(preBalanceAssignment, currentAssignment); currentPartitionConsumer.clear(); currentPartitionConsumer.putAll(preBalancePartitionConsumers); } // add the fixed assignments (those that could not change) back for (Entry<String, List<TopicPartition>> entry: fixedAssignments.entrySet()) { String consumer = entry.getKey(); currentAssignment.put(consumer, entry.getValue()); sortedCurrentSubscriptions.add(consumer); } fixedAssignments.clear(); } private boolean performReassignments(List<TopicPartition> reassignablePartitions, TreeSet<String> sortedCurrentSubscriptions, HashMap<String, List<TopicPartition>> consumer2AllPotentialPartitions, HashMap<TopicPartition, List<String>> partition2AllPotentialConsumers, HashMap<TopicPartition, String> currentPartitionConsumer) { boolean reassignmentPerformed = false; boolean modified; // repeat reassignment until no partition can be moved to improve the balance do { modified = false; // reassign all reassignable partitions (starting from the partition with least potential consumers and if needed) // until the full list is processed or a balance is achieved Iterator<TopicPartition> partitionIterator = reassignablePartitions.iterator(); while (partitionIterator.hasNext() && !isBalanced(sortedCurrentSubscriptions, consumer2AllPotentialPartitions)) { TopicPartition partition = partitionIterator.next(); // the partition must have at least two consumers if (partition2AllPotentialConsumers.get(partition).size() <= 1) log.error("Expected more than one potential consumer for partition '" + partition + "'"); // the partition must have a current consumer String consumer = currentPartitionConsumer.get(partition); if (consumer == null) log.error("Expected partition '" + partition + "' to be assigned to a consumer"); // check if a better-suited consumer exist for the partition; if so, reassign it for (String otherConsumer: partition2AllPotentialConsumers.get(partition)) { if (currentAssignment.get(consumer).size() > currentAssignment.get(otherConsumer).size() + 1) { reassignPartition(partition, sortedCurrentSubscriptions, currentPartitionConsumer, consumer2AllPotentialPartitions); reassignmentPerformed = true; modified = true; break; } } } } while (modified); return reassignmentPerformed; } private void reassignPartition(TopicPartition partition, TreeSet<String> sortedCurrentSubscriptions, HashMap<TopicPartition, String> currentPartitionConsumer, HashMap<String, List<TopicPartition>> consumer2AllPotentialPartitions) { String consumer = currentPartitionConsumer.get(partition); // find the new consumer String newConsumer = null; for (String anotherConsumer: sortedCurrentSubscriptions) { if (consumer2AllPotentialPartitions.get(anotherConsumer).contains(partition)) { newConsumer = anotherConsumer; break; } } assert newConsumer != null; // find the correct partition movement considering the stickiness requirement TopicPartition partitionToBeMoved = partitionMovements.getTheActualPartitionToBeMoved(partition, consumer, newConsumer); processPartitionMovement(partitionToBeMoved, newConsumer, sortedCurrentSubscriptions, currentPartitionConsumer); return; } private void processPartitionMovement(TopicPartition partition, String newConsumer, TreeSet<String> sortedCurrentSubscriptions, HashMap<TopicPartition, String> currentPartitionConsumer) { String oldConsumer = currentPartitionConsumer.get(partition); 
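        // the two consumers involved are removed from the sorted set before their assignment lists are modified and
        // re-added afterwards, since the TreeSet is ordered by assignment size and entries must not change their
        // ordering while they are in the set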
sortedCurrentSubscriptions.remove(oldConsumer); sortedCurrentSubscriptions.remove(newConsumer); partitionMovements.movePartition(partition, oldConsumer, newConsumer); currentAssignment.get(oldConsumer).remove(partition); currentAssignment.get(newConsumer).add(partition); currentPartitionConsumer.put(partition, newConsumer); sortedCurrentSubscriptions.add(newConsumer); sortedCurrentSubscriptions.add(oldConsumer); } boolean isSticky() { return partitionMovements.isSticky(); } private static ByteBuffer serializeTopicPartitionAssignment(List<TopicPartition> partitions) { Struct struct = new Struct(STICKY_ASSIGNOR_USER_DATA); List<Struct> topicAssignments = new ArrayList<>(); for (Map.Entry<String, List<Integer>> topicEntry : CollectionUtils.groupDataByTopic(partitions).entrySet()) { Struct topicAssignment = new Struct(TOPIC_ASSIGNMENT); topicAssignment.set(TOPIC_KEY_NAME, topicEntry.getKey()); topicAssignment.set(PARTITIONS_KEY_NAME, topicEntry.getValue().toArray()); topicAssignments.add(topicAssignment); } struct.set(TOPIC_PARTITIONS_KEY_NAME, topicAssignments.toArray()); ByteBuffer buffer = ByteBuffer.allocate(STICKY_ASSIGNOR_USER_DATA.sizeOf(struct)); STICKY_ASSIGNOR_USER_DATA.write(buffer, struct); buffer.flip(); return buffer; } private static List<TopicPartition> deserializeTopicPartitionAssignment(ByteBuffer buffer) { Struct struct = STICKY_ASSIGNOR_USER_DATA.read(buffer); List<TopicPartition> partitions = new ArrayList<>(); for (Object structObj : struct.getArray(TOPIC_PARTITIONS_KEY_NAME)) { Struct assignment = (Struct) structObj; String topic = assignment.getString(TOPIC_KEY_NAME); for (Object partitionObj : assignment.getArray(PARTITIONS_KEY_NAME)) { Integer partition = (Integer) partitionObj; partitions.add(new TopicPartition(topic, partition)); } } return partitions; } private void deepCopy(Map<String, List<TopicPartition>> source, Map<String, List<TopicPartition>> dest) { dest.clear(); for (Entry<String, List<TopicPartition>> entry: source.entrySet()) dest.put(entry.getKey(), new ArrayList<>(entry.getValue())); } private Map<String, List<TopicPartition>> deepCopy(Map<String, List<TopicPartition>> assignment) { Map<String, List<TopicPartition>> copy = new HashMap<>(); deepCopy(assignment, copy); return copy; } private static class PartitionComparator implements Comparator<TopicPartition>, Serializable { private static final long serialVersionUID = 1L; private Map<TopicPartition, List<String>> map; PartitionComparator(Map<TopicPartition, List<String>> map) { this.map = map; } @Override public int compare(TopicPartition o1, TopicPartition o2) { int ret = map.get(o1).size() - map.get(o2).size(); if (ret == 0) { ret = o1.topic().compareTo(o2.topic()); if (ret == 0) ret = o1.partition() - o2.partition(); } return ret; } } private static class SubscriptionComparator implements Comparator<String>, Serializable { private static final long serialVersionUID = 1L; private Map<String, List<TopicPartition>> map; SubscriptionComparator(Map<String, List<TopicPartition>> map) { this.map = map; } @Override public int compare(String o1, String o2) { int ret = map.get(o1).size() - map.get(o2).size(); if (ret == 0) ret = o1.compareTo(o2); return ret; } } /** * This class maintains some data structures to simplify lookup of partition movements among consumers. At each point of * time during a partition rebalance it keeps track of partition movements corresponding to each topic, and also possible * movement (in form a <code>ConsumerPair</code> object) for each partition. 
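 * For example (consumer names here are placeholders for illustration): if partition <code>t0p0</code> moves from
 * consumer A to consumer B during the rebalance, the movement is recorded as the pair <code>A->B</code> under topic
 * <code>t0</code>; if the partition later moves on to consumer C the record is collapsed into <code>A->C</code>, and
 * a move back to A removes the record altogether.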
*/ private static class PartitionMovements { private Map<String, Map<ConsumerPair, Set<TopicPartition>>> partitionMovementsByTopic = new HashMap<>(); private Map<TopicPartition, ConsumerPair> partitionMovements = new HashMap<>(); private ConsumerPair removeMovementRecordOfPartition(TopicPartition partition) { ConsumerPair pair = partitionMovements.remove(partition); String topic = partition.topic(); Map<ConsumerPair, Set<TopicPartition>> partitionMovementsForThisTopic = partitionMovementsByTopic.get(topic); partitionMovementsForThisTopic.get(pair).remove(partition); if (partitionMovementsForThisTopic.get(pair).isEmpty()) partitionMovementsForThisTopic.remove(pair); if (partitionMovementsByTopic.get(topic).isEmpty()) partitionMovementsByTopic.remove(topic); return pair; } private void addPartitionMovementRecord(TopicPartition partition, ConsumerPair pair) { partitionMovements.put(partition, pair); String topic = partition.topic(); if (!partitionMovementsByTopic.containsKey(topic)) partitionMovementsByTopic.put(topic, new HashMap<ConsumerPair, Set<TopicPartition>>()); Map<ConsumerPair, Set<TopicPartition>> partitionMovementsForThisTopic = partitionMovementsByTopic.get(topic); if (!partitionMovementsForThisTopic.containsKey(pair)) partitionMovementsForThisTopic.put(pair, new HashSet<TopicPartition>()); partitionMovementsForThisTopic.get(pair).add(partition); } private void movePartition(TopicPartition partition, String oldConsumer, String newConsumer) { ConsumerPair pair = new ConsumerPair(oldConsumer, newConsumer); if (partitionMovements.containsKey(partition)) { // this partition has previously moved ConsumerPair existingPair = removeMovementRecordOfPartition(partition); assert existingPair.dstMemberId.equals(oldConsumer); if (!existingPair.srcMemberId.equals(newConsumer)) { // the partition is not moving back to its previous consumer // return new ConsumerPair2(existingPair.src, newConsumer); addPartitionMovementRecord(partition, new ConsumerPair(existingPair.srcMemberId, newConsumer)); } } else addPartitionMovementRecord(partition, pair); } private TopicPartition getTheActualPartitionToBeMoved(TopicPartition partition, String oldConsumer, String newConsumer) { String topic = partition.topic(); if (!partitionMovementsByTopic.containsKey(topic)) return partition; if (partitionMovements.containsKey(partition)) { // this partition has previously moved assert oldConsumer.equals(partitionMovements.get(partition).dstMemberId); oldConsumer = partitionMovements.get(partition).srcMemberId; } Map<ConsumerPair, Set<TopicPartition>> partitionMovementsForThisTopic = partitionMovementsByTopic.get(topic); ConsumerPair reversePair = new ConsumerPair(newConsumer, oldConsumer); if (!partitionMovementsForThisTopic.containsKey(reversePair)) return partition; return partitionMovementsForThisTopic.get(reversePair).iterator().next(); } private boolean isLinked(String src, String dst, Set<ConsumerPair> pairs, List<String> currentPath) { if (src.equals(dst)) return false; if (pairs.isEmpty()) return false; if (new ConsumerPair(src, dst).in(pairs)) { currentPath.add(src); currentPath.add(dst); return true; } for (ConsumerPair pair: pairs) if (pair.srcMemberId.equals(src)) { Set<ConsumerPair> reducedSet = new HashSet<>(pairs); reducedSet.remove(pair); currentPath.add(pair.srcMemberId); return isLinked(pair.dstMemberId, dst, reducedSet, currentPath); } return false; } private boolean in(List<String> cycle, Set<List<String>> cycles) { List<String> superCycle = new ArrayList<>(cycle); 
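            // 'superCycle' (completed on the following lines) is the cycle with its duplicated last element dropped,
            // concatenated with the full cycle; any rotation of an already-recorded cycle of the same length then
            // appears as a contiguous sublist, which Collections.indexOfSubList detects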
superCycle.remove(superCycle.size() - 1); superCycle.addAll(cycle); for (List<String> foundCycle: cycles) { if (foundCycle.size() == cycle.size() && Collections.indexOfSubList(superCycle, foundCycle) != -1) return true; } return false; } private boolean hasCycles(Set<ConsumerPair> pairs) { Set<List<String>> cycles = new HashSet<>(); for (ConsumerPair pair: pairs) { Set<ConsumerPair> reducedPairs = new HashSet<>(pairs); reducedPairs.remove(pair); List<String> path = new ArrayList<>(Collections.singleton(pair.srcMemberId)); if (isLinked(pair.dstMemberId, pair.srcMemberId, reducedPairs, path) && !in(path, cycles)) { cycles.add(new ArrayList<>(path)); log.error("A cycle of length " + (path.size() - 1) + " was found: " + path.toString()); } } // for now we want to make sure there is no partition movements of the same topic between a pair of consumers. // the odds of finding a cycle among more than two consumers seem to be very low (according to various randomized // tests with the given sticky algorithm) that it should not worth the added complexity of handling those cases. for (List<String> cycle: cycles) if (cycle.size() == 3) // indicates a cycle of length 2 return true; return false; } private boolean isSticky() { for (Map.Entry<String, Map<ConsumerPair, Set<TopicPartition>>> topicMovements: this.partitionMovementsByTopic.entrySet()) { Set<ConsumerPair> topicMovementPairs = topicMovements.getValue().keySet(); if (hasCycles(topicMovementPairs)) { log.error("Stickiness is violated for topic " + topicMovements.getKey() + "\nPartition movements for this topic occurred among the following consumer pairs:" + "\n" + topicMovements.getValue().toString()); return false; } } return true; } } /** * <code>ConsumerPair</code> represents a pair of Kafka consumer ids involved in a partition reassignment. Each * <code>ConsumerPair</code> object, which contains a source (<code>src</code>) and a destination (<code>dst</code>) * element, normally corresponds to a particular partition or topic, and indicates that the particular partition or some * partition of the particular topic was moved from the source consumer to the destination consumer during the rebalance. * This class is used, through the <code>PartitionMovements</code> class, by the sticky assignor and helps in determining * whether a partition reassignment results in cycles among the generated graph of consumer pairs. */ private static class ConsumerPair { private final String srcMemberId; private final String dstMemberId; ConsumerPair(String srcMemberId, String dstMemberId) { this.srcMemberId = srcMemberId; this.dstMemberId = dstMemberId; } public String toString() { return this.srcMemberId + "->" + this.dstMemberId; } @Override public int hashCode() { final int prime = 31; int result = 1; result = prime * result + ((this.srcMemberId == null) ? 0 : this.srcMemberId.hashCode()); result = prime * result + ((this.dstMemberId == null) ? 0 : this.dstMemberId.hashCode()); return result; } @Override public boolean equals(Object obj) { if (obj == null) return false; if (!getClass().isInstance(obj)) return false; ConsumerPair otherPair = (ConsumerPair) obj; return this.srcMemberId.equals(otherPair.srcMemberId) && this.dstMemberId.equals(otherPair.dstMemberId); } private boolean in(Set<ConsumerPair> pairs) { for (ConsumerPair pair: pairs) if (this.equals(pair)) return true; return false; } } }