/*
 * Copyright © 2014 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.common.zookeeper.coordination;

import com.google.common.collect.MinMaxPriorityQueue;
import com.google.common.collect.Multimap;
import com.google.common.primitives.Ints;

import java.util.Collection;
import java.util.Set;

/**
 * An {@link AssignmentStrategy} that tries to balance partition replica assignment with minimum movement.
 *
 * <p>The strategy works in two phases: first every unassigned partition replica is given to the handler that
 * currently has the fewest replicas, then replicas are moved one at a time from the most loaded handler to the
 * least loaded handler until the difference between them is within the allowed bound.
 */
public class BalancedAssignmentStrategy implements AssignmentStrategy {

  /**
   * Assigns all unassigned partition replicas of the given requirement and then balances the assignment
   * across the given handlers.
   *
   * <p>If {@code handlers} is empty there is nobody to assign to, so the method returns without touching
   * the assigner.
   *
   * @param requirement the requirement listing the partitions and the number of replicas of each
   * @param handlers the handlers available to take partition replicas
   * @param assigner the assigner used to inspect and change the assignment
   * @param <T> type of the resource handler
   */
  @Override
  public <T> void assign(ResourceRequirement requirement, Set<T> handlers, ResourceAssigner<T> assigner) {
    if (handlers.isEmpty()) {
      // Without any handler nothing can be assigned or balanced. Returning here avoids polling an empty queue
      // (NoSuchElementException from removeFirst) or dereferencing a null peek result in the balance phase.
      return;
    }

    MinMaxPriorityQueue<HandlerSize<T>> handlerQueue = MinMaxPriorityQueue.create();
    Multimap<T, PartitionReplica> assignments = assigner.get();

    // Compute for each handler how many partition replicas are already assigned
    for (T handler : handlers) {
      handlerQueue.add(new HandlerSize<>(handler, assignments));
    }

    // For each unassigned partition replica in the requirement, assign it to the handler with the fewest
    // partition replicas assigned. It's just a heuristic to make the later balance phase do less work.
    int totalPartitionReplica = 0;

    for (ResourceRequirement.Partition partition : requirement.getPartitions()) {
      int replicas = partition.getReplicas();
      totalPartitionReplica += replicas;

      for (int replica = 0; replica < replicas; replica++) {
        if (assigner.getHandler(partition.getName(), replica) == null) {
          // The handler is taken out of the queue before its (live) size changes and is re-inserted afterwards,
          // so that the queue orders it by the updated size.
          HandlerSize<T> handlerSize = handlerQueue.removeFirst();
          assigner.set(handlerSize.getHandler(), partition.getName(), replica);
          handlerQueue.add(handlerSize);
        }
      }
    }

    // Balance
    if (totalPartitionReplica > handlers.size()) {
      balance(handlerQueue, assigner, 1);
    } else {
      // Evenly distribute it to the first N handlers.
      while (handlerQueue.size() > totalPartitionReplica) {
        // If the number of handlers is > total partition replicas,
        // there must be at least one handler that has nothing assigned.
        handlerQueue.removeFirst();
      }
      // Balance it evenly; there should be no difference in the number of partition replicas assigned to
      // each remaining handler. The queue is empty here when the requirement has no replica at all.
      balance(handlerQueue, assigner, 0);
    }
  }

  /**
   * Balance the assignment by spreading it across all handlers evenly.
   *
   * <p>Does nothing if the queue is empty.
   *
   * @param handlerQueue The priority queue for tracking number of resources assigned to a given handler.
   * @param assigner The assigner for changing the assignment.
   * @param maxDiff The maximum difference allowed between the handler that has the most resources assigned
   *                and the one with the least resources assigned.
   * @param <T> Type of resource handler.
   */
  private <T> void balance(MinMaxPriorityQueue<HandlerSize<T>> handlerQueue,
                           ResourceAssigner<T> assigner, int maxDiff) {
    if (handlerQueue.isEmpty()) {
      // peekFirst()/peekLast() return null on an empty queue; there is nothing to balance.
      return;
    }

    HandlerSize<T> minHandler = handlerQueue.peekFirst();
    HandlerSize<T> maxHandler = handlerQueue.peekLast();

    // Move assignments from the handler that has the most assigned partition replicas to the one that has the
    // least, until the difference is within the desired range.
    Multimap<T, PartitionReplica> assignments = assigner.get();
    while (maxHandler.getSize() - minHandler.getSize() > maxDiff) {
      // Pick any one replica currently held by the most loaded handler.
      PartitionReplica partitionReplica = assignments.get(maxHandler.getHandler()).iterator().next();

      // Remove min and max from the queue before performing the reassignment, since their sizes are live
      // and are about to change.
      handlerQueue.removeFirst();
      handlerQueue.removeLast();

      assigner.set(minHandler.getHandler(), partitionReplica);

      // After the reassignment the sizes are updated; put both back so the queue re-orders them.
      handlerQueue.add(minHandler);
      handlerQueue.add(maxHandler);

      minHandler = handlerQueue.peekFirst();
      maxHandler = handlerQueue.peekLast();
    }
  }

  /**
   * This class records the number of partition replicas assigned to a handler. It is used in a priority queue
   * for fast retrieval of the handler with the min/max number of resources assigned.
   *
   * <p>Instances are ordered by their current size only, and that size changes as the underlying assignment
   * changes. An instance should therefore be removed from the queue before its handler's assignment is
   * modified and added back afterwards.
   *
   * @param <T> Type of resource handler.
   */
  private static final class HandlerSize<T> implements Comparable<HandlerSize<T>> {

    private final T handler;
    // This is a live view from the assignments multimap. Updates to the multimap will update this view.
    private final Collection<PartitionReplica> assigned;

    private HandlerSize(T handler, Multimap<T, PartitionReplica> assignments) {
      this.handler = handler;
      this.assigned = assignments.get(handler);
    }

    /**
     * Returns the handler tracked by this instance.
     */
    public T getHandler() {
      return handler;
    }

    /**
     * Returns the number of partition replicas currently assigned to the handler.
     */
    public int getSize() {
      return assigned.size();
    }

    @Override
    public int compareTo(HandlerSize<T> o) {
      return Ints.compare(getSize(), o.getSize());
    }
  }
}