/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hbase.master.balancer; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.NavigableMap; import java.util.Random; import java.util.TreeMap; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.hbase.HRegionInfo; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.master.AssignmentManager; import org.apache.hadoop.hbase.master.RegionPlan; import com.google.common.collect.MinMaxPriorityQueue; /** * Makes decisions about the placement and movement of Regions across * RegionServers. * * <p>Cluster-wide load balancing will occur only when there are no regions in * transition and according to a fixed period of a time using {@link #balanceCluster(Map)}. * * <p>Inline region placement with {@link #immediateAssignment} can be used when * the Master needs to handle closed regions that it currently does not have * a destination set for. This can happen during master failover. * * <p>On cluster startup, bulk assignment can be used to determine * locations for all Regions in a cluster. * * <p>This classes produces plans for the {@link AssignmentManager} to execute. */ @InterfaceAudience.Private public class DefaultLoadBalancer extends BaseLoadBalancer { private static final Log LOG = LogFactory.getLog(DefaultLoadBalancer.class); private static final Random RANDOM = new Random(System.currentTimeMillis()); private RegionInfoComparator riComparator = new RegionInfoComparator(); private RegionPlan.RegionPlanComparator rpComparator = new RegionPlan.RegionPlanComparator(); /** * Stores additional per-server information about the regions added/removed * during the run of the balancing algorithm. * * For servers that shed regions, we need to track which regions we have already * shed. <b>nextRegionForUnload</b> contains the index in the list of regions on * the server that is the next to be shed. */ static class BalanceInfo { private final int nextRegionForUnload; private int numRegionsAdded; public BalanceInfo(int nextRegionForUnload, int numRegionsAdded) { this.nextRegionForUnload = nextRegionForUnload; this.numRegionsAdded = numRegionsAdded; } int getNextRegionForUnload() { return nextRegionForUnload; } int getNumRegionsAdded() { return numRegionsAdded; } void setNumRegionsAdded(int numAdded) { this.numRegionsAdded = numAdded; } } /** * Generate a global load balancing plan according to the specified map of * server information to the most loaded regions of each server. * * The load balancing invariant is that all servers are within 1 region of the * average number of regions per server. If the average is an integer number, * all servers will be balanced to the average. Otherwise, all servers will * have either floor(average) or ceiling(average) regions. * * HBASE-3609 Modeled regionsToMove using Guava's MinMaxPriorityQueue so that * we can fetch from both ends of the queue. * At the beginning, we check whether there was empty region server * just discovered by Master. If so, we alternately choose new / old * regions from head / tail of regionsToMove, respectively. This alternation * avoids clustering young regions on the newly discovered region server. * Otherwise, we choose new regions from head of regionsToMove. * * Another improvement from HBASE-3609 is that we assign regions from * regionsToMove to underloaded servers in round-robin fashion. * Previously one underloaded server would be filled before we move onto * the next underloaded server, leading to clustering of young regions. * * Finally, we randomly shuffle underloaded servers so that they receive * offloaded regions relatively evenly across calls to balanceCluster(). * * The algorithm is currently implemented as such: * * <ol> * <li>Determine the two valid numbers of regions each server should have, * <b>MIN</b>=floor(average) and <b>MAX</b>=ceiling(average). * * <li>Iterate down the most loaded servers, shedding regions from each so * each server hosts exactly <b>MAX</b> regions. Stop once you reach a * server that already has <= <b>MAX</b> regions. * <p> * Order the regions to move from most recent to least. * * <li>Iterate down the least loaded servers, assigning regions so each server * has exactly </b>MIN</b> regions. Stop once you reach a server that * already has >= <b>MIN</b> regions. * * Regions being assigned to underloaded servers are those that were shed * in the previous step. It is possible that there were not enough * regions shed to fill each underloaded server to <b>MIN</b>. If so we * end up with a number of regions required to do so, <b>neededRegions</b>. * * It is also possible that we were able to fill each underloaded but ended * up with regions that were unassigned from overloaded servers but that * still do not have assignment. * * If neither of these conditions hold (no regions needed to fill the * underloaded servers, no regions leftover from overloaded servers), * we are done and return. Otherwise we handle these cases below. * * <li>If <b>neededRegions</b> is non-zero (still have underloaded servers), * we iterate the most loaded servers again, shedding a single server from * each (this brings them from having <b>MAX</b> regions to having * <b>MIN</b> regions). * * <li>We now definitely have more regions that need assignment, either from * the previous step or from the original shedding from overloaded servers. * Iterate the least loaded servers filling each to <b>MIN</b>. * * <li>If we still have more regions that need assignment, again iterate the * least loaded servers, this time giving each one (filling them to * </b>MAX</b>) until we run out. * * <li>All servers will now either host <b>MIN</b> or <b>MAX</b> regions. * * In addition, any server hosting >= <b>MAX</b> regions is guaranteed * to end up with <b>MAX</b> regions at the end of the balancing. This * ensures the minimal number of regions possible are moved. * </ol> * * TODO: We can at-most reassign the number of regions away from a particular * server to be how many they report as most loaded. * Should we just keep all assignment in memory? Any objections? * Does this mean we need HeapSize on HMaster? Or just careful monitor? * (current thinking is we will hold all assignments in memory) * * @param clusterState Map of regionservers and their load/region information to * a list of their most loaded regions * @return a list of regions to be moved, including source and destination, * or null if cluster is already balanced */ public List<RegionPlan> balanceCluster( Map<ServerName, List<HRegionInfo>> clusterMap) { boolean emptyRegionServerPresent = false; long startTime = System.currentTimeMillis(); ClusterLoadState cs = new ClusterLoadState(clusterMap); int numServers = cs.getNumServers(); if (numServers == 0) { LOG.debug("numServers=0 so skipping load balancing"); return null; } NavigableMap<ServerAndLoad, List<HRegionInfo>> serversByLoad = cs.getServersByLoad(); int numRegions = cs.getNumRegions(); if (!this.needsBalance(cs)) { // Skipped because no server outside (min,max) range float average = cs.getLoadAverage(); // for logging LOG.info("Skipping load balancing because balanced cluster; " + "servers=" + numServers + " " + "regions=" + numRegions + " average=" + average + " " + "mostloaded=" + serversByLoad.lastKey().getLoad() + " leastloaded=" + serversByLoad.firstKey().getLoad()); return null; } int min = numRegions / numServers; int max = numRegions % numServers == 0 ? min : min + 1; // Using to check balance result. StringBuilder strBalanceParam = new StringBuilder(); strBalanceParam.append("Balance parameter: numRegions=").append(numRegions) .append(", numServers=").append(numServers).append(", max=").append(max) .append(", min=").append(min); LOG.debug(strBalanceParam.toString()); // Balance the cluster // TODO: Look at data block locality or a more complex load to do this MinMaxPriorityQueue<RegionPlan> regionsToMove = MinMaxPriorityQueue.orderedBy(rpComparator).create(); List<RegionPlan> regionsToReturn = new ArrayList<RegionPlan>(); // Walk down most loaded, pruning each to the max int serversOverloaded = 0; // flag used to fetch regions from head and tail of list, alternately boolean fetchFromTail = false; Map<ServerName, BalanceInfo> serverBalanceInfo = new TreeMap<ServerName, BalanceInfo>(); for (Map.Entry<ServerAndLoad, List<HRegionInfo>> server: serversByLoad.descendingMap().entrySet()) { ServerAndLoad sal = server.getKey(); int regionCount = sal.getLoad(); if (regionCount <= max) { serverBalanceInfo.put(sal.getServerName(), new BalanceInfo(0, 0)); break; } serversOverloaded++; List<HRegionInfo> regions = server.getValue(); int numToOffload = Math.min(regionCount - max, regions.size()); // account for the out-of-band regions which were assigned to this server // after some other region server crashed Collections.sort(regions, riComparator); int numTaken = 0; for (int i = 0; i <= numToOffload; ) { HRegionInfo hri = regions.get(i); // fetch from head if (fetchFromTail) { hri = regions.get(regions.size() - 1 - i); } i++; // Don't rebalance meta regions. if (hri.isMetaRegion()) continue; regionsToMove.add(new RegionPlan(hri, sal.getServerName(), null)); numTaken++; if (numTaken >= numToOffload) break; // fetch in alternate order if there is new region server if (emptyRegionServerPresent) { fetchFromTail = !fetchFromTail; } } serverBalanceInfo.put(sal.getServerName(), new BalanceInfo(numToOffload, (-1)*numTaken)); } int totalNumMoved = regionsToMove.size(); // Walk down least loaded, filling each to the min int neededRegions = 0; // number of regions needed to bring all up to min fetchFromTail = false; Map<ServerName, Integer> underloadedServers = new HashMap<ServerName, Integer>(); float average = (float)numRegions / numServers; // for logging int maxToTake = numRegions - (int)average; for (Map.Entry<ServerAndLoad, List<HRegionInfo>> server: serversByLoad.entrySet()) { if (maxToTake == 0) break; // no more to take int regionCount = server.getKey().getLoad(); if (regionCount >= min && regionCount > 0) { continue; // look for other servers which haven't reached min } int regionsToPut = min - regionCount; if (regionsToPut == 0) { regionsToPut = 1; maxToTake--; } underloadedServers.put(server.getKey().getServerName(), regionsToPut); } // number of servers that get new regions int serversUnderloaded = underloadedServers.size(); int incr = 1; List<ServerName> sns = Arrays.asList(underloadedServers.keySet().toArray(new ServerName[serversUnderloaded])); Collections.shuffle(sns, RANDOM); while (regionsToMove.size() > 0) { int cnt = 0; int i = incr > 0 ? 0 : underloadedServers.size()-1; for (; i >= 0 && i < underloadedServers.size(); i += incr) { if (regionsToMove.isEmpty()) break; ServerName si = sns.get(i); int numToTake = underloadedServers.get(si); if (numToTake == 0) continue; addRegionPlan(regionsToMove, fetchFromTail, si, regionsToReturn); if (emptyRegionServerPresent) { fetchFromTail = !fetchFromTail; } underloadedServers.put(si, numToTake-1); cnt++; BalanceInfo bi = serverBalanceInfo.get(si); if (bi == null) { bi = new BalanceInfo(0, 0); serverBalanceInfo.put(si, bi); } bi.setNumRegionsAdded(bi.getNumRegionsAdded()+1); } if (cnt == 0) break; // iterates underloadedServers in the other direction incr = -incr; } for (Integer i : underloadedServers.values()) { // If we still want to take some, increment needed neededRegions += i; } // If none needed to fill all to min and none left to drain all to max, // we are done if (neededRegions == 0 && regionsToMove.isEmpty()) { long endTime = System.currentTimeMillis(); LOG.info("Calculated a load balance in " + (endTime-startTime) + "ms. " + "Moving " + totalNumMoved + " regions off of " + serversOverloaded + " overloaded servers onto " + serversUnderloaded + " less loaded servers"); return regionsToReturn; } // Need to do a second pass. // Either more regions to assign out or servers that are still underloaded // If we need more to fill min, grab one from each most loaded until enough if (neededRegions != 0) { // Walk down most loaded, grabbing one from each until we get enough for (Map.Entry<ServerAndLoad, List<HRegionInfo>> server : serversByLoad.descendingMap().entrySet()) { BalanceInfo balanceInfo = serverBalanceInfo.get(server.getKey().getServerName()); int idx = balanceInfo == null ? 0 : balanceInfo.getNextRegionForUnload(); if (idx >= server.getValue().size()) break; HRegionInfo region = server.getValue().get(idx); if (region.isMetaRegion()) continue; // Don't move meta regions. regionsToMove.add(new RegionPlan(region, server.getKey().getServerName(), null)); totalNumMoved++; if (--neededRegions == 0) { // No more regions needed, done shedding break; } } } // Now we have a set of regions that must be all assigned out // Assign each underloaded up to the min, then if leftovers, assign to max // Walk down least loaded, assigning to each to fill up to min for (Map.Entry<ServerAndLoad, List<HRegionInfo>> server : serversByLoad.entrySet()) { int regionCount = server.getKey().getLoad(); if (regionCount >= min) break; BalanceInfo balanceInfo = serverBalanceInfo.get(server.getKey().getServerName()); if(balanceInfo != null) { regionCount += balanceInfo.getNumRegionsAdded(); } if(regionCount >= min) { continue; } int numToTake = min - regionCount; int numTaken = 0; while(numTaken < numToTake && 0 < regionsToMove.size()) { addRegionPlan(regionsToMove, fetchFromTail, server.getKey().getServerName(), regionsToReturn); numTaken++; if (emptyRegionServerPresent) { fetchFromTail = !fetchFromTail; } } } // If we still have regions to dish out, assign underloaded to max if (0 < regionsToMove.size()) { for (Map.Entry<ServerAndLoad, List<HRegionInfo>> server : serversByLoad.entrySet()) { int regionCount = server.getKey().getLoad(); if(regionCount >= max) { break; } addRegionPlan(regionsToMove, fetchFromTail, server.getKey().getServerName(), regionsToReturn); if (emptyRegionServerPresent) { fetchFromTail = !fetchFromTail; } if (regionsToMove.isEmpty()) { break; } } } long endTime = System.currentTimeMillis(); if (!regionsToMove.isEmpty() || neededRegions != 0) { // Emit data so can diagnose how balancer went astray. LOG.warn("regionsToMove=" + totalNumMoved + ", numServers=" + numServers + ", serversOverloaded=" + serversOverloaded + ", serversUnderloaded=" + serversUnderloaded); StringBuilder sb = new StringBuilder(); for (Map.Entry<ServerName, List<HRegionInfo>> e: clusterMap.entrySet()) { if (sb.length() > 0) sb.append(", "); sb.append(e.getKey().toString()); sb.append(" "); sb.append(e.getValue().size()); } LOG.warn("Input " + sb.toString()); } // All done! LOG.info("Done. Calculated a load balance in " + (endTime-startTime) + "ms. " + "Moving " + totalNumMoved + " regions off of " + serversOverloaded + " overloaded servers onto " + serversUnderloaded + " less loaded servers"); return regionsToReturn; } /** * Add a region from the head or tail to the List of regions to return. */ private void addRegionPlan(final MinMaxPriorityQueue<RegionPlan> regionsToMove, final boolean fetchFromTail, final ServerName sn, List<RegionPlan> regionsToReturn) { RegionPlan rp = null; if (!fetchFromTail) rp = regionsToMove.remove(); else rp = regionsToMove.removeLast(); rp.setDestination(sn); regionsToReturn.add(rp); } }