/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.server.balancer;
import java.util.Arrays;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.server.balancer.Balancer.BalancerDatanode;
import org.apache.hadoop.hdfs.server.balancer.Balancer.NodeTask;
import org.apache.hadoop.hdfs.server.balancer.Balancer.Source;
import org.apache.hadoop.hdfs.server.balancer.Balancer.Target;
import org.apache.hadoop.net.NetworkTopology;
import org.apache.hadoop.net.Node;
/**
 * Keeps the plan of pipelines to create between nodes and how much data must be sent
 * over each of them.
 *
 * A plan is computed in the constructor in three passes:
 * 1. classify every datanode as a balancing source (below-average remaining space)
 *    or target (at/above-average remaining space), grouped into per-rack buckets
 *    plus one cluster-wide bucket;
 * 2. pair sources with targets within each sufficiently-balanced rack;
 * 3. pair whatever is left cluster-wide.
 *
 * All "remaining" quantities are expressed as percentages of node capacity
 * (see PERCENTAGE_BASE); byte amounts are derived from them where needed.
 */
class BalancePlan {
  protected static final Log LOG = LogFactory.getLog(BalancePlan.class.getName());

  /** Number of bytes to be moved in order to make the cluster balanced. */
  public long bytesLeftToMove;
  /** Total number of bytes this plan actually schedules for movement (sum over sources). */
  public long bytesToMove;
  /** Network topology of all datanodes participating in balancing. */
  public NetworkTopology cluster = new NetworkTopology();
  /** Maps datanode's storage ID to itself */
  public Map<String, BalancerDatanode> datanodes = new HashMap<String, BalancerDatanode>();
  /** All nodes that will participate in balancing as sources */
  public Collection<Source> sources = new HashSet<Source>();
  /** All nodes that will participate in balancing as targets */
  public Collection<Target> targets = new HashSet<Target>();
  /** If remaining < lowerRemainingThreshold then DataNode is considered overutilized */
  private double lowerRemainingThreshold;
  /** If remaining > upperRemainingThreshold then DataNode is considered underutilized */
  private double upperRemainingThreshold;
  /** Cluster-wide remaining capacity percentage */
  private double avgRemaining;

  /**
   * Compute balance plan.
   *
   * @param balancer supplies the configured threshold and the Source factory
   * @param datanodes live datanode reports to plan over; must be non-empty
   * @throws IllegalArgumentException if {@code datanodes} is null or empty
   * @throws IllegalStateException if the computed balanced range is empty
   *         (lower bound above upper bound)
   */
  public BalancePlan(Balancer balancer, List<DatanodeInfo> datanodes) {
    if (datanodes == null || datanodes.isEmpty()) {
      throw new IllegalArgumentException("cannot prepare plan for empty cluster");
    }
    // The "balanced" band is avgRemaining +/- threshold, clamped so the lower
    // bound never drops below half the average and the upper bound never
    // exceeds 100%.
    avgRemaining = computeAvgRemaining(datanodes);
    lowerRemainingThreshold = Math.max(avgRemaining / 2, avgRemaining - balancer.threshold);
    upperRemainingThreshold = Math.min(PERCENTAGE_BASE, avgRemaining + balancer.threshold);
    if (lowerRemainingThreshold > upperRemainingThreshold) {
      throw new IllegalStateException("lowerThresh > upperThresh");
    }
    LOG.info("balanced range: [ " + lowerRemainingThreshold + ", " + upperRemainingThreshold
        + " ], average remaining: " + avgRemaining);
    long overLoadedBytes = 0L, underLoadedBytes = 0L;
    // One bucket spanning the whole cluster, plus one bucket per rack.
    Bucket clusterBucket = new Bucket();
    Map<Node, Bucket> rackBuckets = new HashMap<Node, Bucket>();
    for (DatanodeInfo datanode : datanodes) {
      // Update network topology
      cluster.add(datanode);
      // Create bucket if none
      assert datanode.getParent() != null : "node outside of any rack";
      Bucket bucket = rackBuckets.get(datanode.getParent());
      if (bucket == null) {
        bucket = new Bucket();
        rackBuckets.put(datanode.getParent(), bucket);
      }
      // Put DataNode into chosen bucket: nodes with less-than-average remaining
      // space have excess data and become sources; the rest become targets.
      BalancerDatanode datanodeS;
      if (getRemaining(datanode) < avgRemaining) {
        // Above average utilized
        datanodeS = balancer.getSource(datanode, avgRemaining);
        bucket.addSource((Source) datanodeS);
        clusterBucket.addSource((Source) datanodeS);
        if (isOverUtilized(datanodeS)) {
          // Percentage shortfall below the lower bound, converted to bytes of
          // this node's capacity.
          overLoadedBytes += (long) ((lowerRemainingThreshold - datanodeS.getCurrentRemaining())
              * datanodeS.getDatanode().getCapacity() / PERCENTAGE_BASE);
        }
      } else {
        // Below average utilized
        datanodeS = new Target(datanode, avgRemaining);
        bucket.addTarget((Target) datanodeS);
        clusterBucket.addTarget((Target) datanodeS);
        if (isUnderUtilized(datanodeS)) {
          // Percentage excess above the upper bound, converted to bytes.
          underLoadedBytes += (long) ((datanodeS.getCurrentRemaining() - upperRemainingThreshold)
              * datanodeS.getDatanode().getCapacity() / PERCENTAGE_BASE);
        }
      }
      // Update all DataNodes list
      this.datanodes.put(datanode.getStorageID(), datanodeS);
    }
    // Moving this many bytes would bring every node inside the balanced band;
    // the larger of the two sides bounds the total work required.
    bytesLeftToMove = Math.max(overLoadedBytes, underLoadedBytes);
    logImbalancedNodes();
    // Balance each rack bucket separately, but only racks whose own average
    // remaining falls inside the cluster-wide balanced band: intra-rack moves
    // in such racks can make progress without fighting the cluster-wide pass.
    for (Bucket bucket : rackBuckets.values()) {
      double rackAverage = bucket.computeAvgRemaining();
      if (lowerRemainingThreshold <= rackAverage && rackAverage <= upperRemainingThreshold) {
        bucket.updatePlan();
      }
      // If perfectly balanced rack renders only over or underutilized DataNodes
      // we do not bother balancing it
    }
    // Balance cluster-wide afterwards. The rack-level pass may have changed
    // node priorities, so the cluster bucket's heaps must be rebuilt first.
    clusterBucket.externalUpdate();
    clusterBucket.updatePlan();
    bytesToMove = 0L;
    for (Source src : sources) {
      bytesToMove += src.scheduledSize;
    }
    logPlanOutcome();
  }

  /** Log the over utilized & under utilized nodes */
  private void logImbalancedNodes() {
    if (LOG.isInfoEnabled()) {
      // First pass: count; subsequent passes: list the node names per category.
      int underUtilized = 0, overUtilized = 0;
      for (BalancerDatanode node : this.datanodes.values()) {
        if (isUnderUtilized(node))
          underUtilized++;
        else if (isOverUtilized(node))
          overUtilized++;
      }
      StringBuilder msg = new StringBuilder();
      msg.append(overUtilized);
      msg.append(" over utilized nodes:");
      for (BalancerDatanode node : this.datanodes.values()) {
        if (isOverUtilized(node)) {
          msg.append(" ");
          msg.append(node.getName());
        }
      }
      LOG.info(msg);
      msg = new StringBuilder();
      msg.append(underUtilized);
      msg.append(" under utilized nodes: ");
      for (BalancerDatanode node : this.datanodes.values()) {
        if (isUnderUtilized(node)) {
          msg.append(" ");
          msg.append(node.getName());
        }
      }
      LOG.info(msg);
    }
  }

  /** Log node utilization after the plan execution */
  private void logPlanOutcome() {
    if (LOG.isInfoEnabled()) {
      LOG.info("Predicted plan outcome: bytesLeftToMove: "
          + bytesLeftToMove + ", bytesToMove: " + bytesToMove);
      for (BalancerDatanode node : this.datanodes.values()) {
        LOG.info(node.getName() + " remaining: " + node.getCurrentRemaining());
      }
    }
  }

  /**
   * Pairs up given nodes in balancing plan: records a transfer of {@code size}
   * bytes from {@code source} to {@code target} on both endpoints and registers
   * both nodes as plan participants.
   */
  private void scheduleTask(Source source, long size, Target target) {
    NodeTask nodeTask = new NodeTask(target, size);
    source.addNodeTask(nodeTask);
    target.addNodeTask(nodeTask);
    sources.add(source);
    targets.add(target);
    LOG.info("scheduled " + size + " bytes : " + source.getName() + " -> " + target.getName());
  }

  /** Determines if the node is overutilized */
  private boolean isOverUtilized(BalancerDatanode datanode) {
    return datanode.getCurrentRemaining() < lowerRemainingThreshold;
  }

  /** Determines if the node is underutilized */
  private boolean isUnderUtilized(BalancerDatanode datanode) {
    return datanode.getCurrentRemaining() > upperRemainingThreshold;
  }

  /** True iff the DataNode was over or underutilized before balancing */
  private boolean wasUrgent(BalancerDatanode datanode) {
    // Note that no node can become urgent during balancing if it was not before
    return datanode.initialRemaining < lowerRemainingThreshold || datanode.initialRemaining > upperRemainingThreshold;
  }

  /** Remaining ratio is expressed in percents */
  static final double PERCENTAGE_BASE = 100.0;

  /** Cluster-wide remaining space as a percentage of total capacity. */
  static double computeAvgRemaining(Iterable<DatanodeInfo> datanodes) {
    long totalCapacity = 0L, totalRemainingSpace = 0L;
    for (DatanodeInfo datanode : datanodes) {
      totalCapacity += datanode.getCapacity();
      totalRemainingSpace += datanode.getRemaining();
    }
    return (double) totalRemainingSpace / totalCapacity * PERCENTAGE_BASE;
  }

  /** Single node's remaining space as a percentage of its capacity. */
  static double getRemaining(DatanodeInfo datanode) {
    return (double) datanode.getRemaining() / datanode.getCapacity() * PERCENTAGE_BASE;
  }

  /**
   * Set of nodes which can interchange data for balancing.
   *
   * Sources and targets are kept in priority queues ordered so that urgent
   * (initially over/under-utilized) nodes come first — updatePlan's early
   * break relies on this ordering.
   */
  private class Bucket {
    private PriorityQueue<Source> sources = new PriorityQueue<Source>(10, new SourceComparator());
    private PriorityQueue<Target> targets = new PriorityQueue<Target>(10, new TargetComparator());

    public void addSource(Source node) {
      this.sources.add(node);
    }

    public void addTarget(Target node) {
      this.targets.add(node);
    }

    /** Remaining-space percentage averaged over all nodes in this bucket. */
    public double computeAvgRemaining() {
      long totalCapacity = 0L, totalRemainingSpace = 0L;
      for (BalancerDatanode node : sources) {
        totalCapacity += node.getDatanode().getCapacity();
        totalRemainingSpace += node.getDatanode().getRemaining();
      }
      for (BalancerDatanode node : targets) {
        totalCapacity += node.getDatanode().getCapacity();
        totalRemainingSpace += node.getDatanode().getRemaining();
      }
      return ((double) totalRemainingSpace) / totalCapacity * PERCENTAGE_BASE;
    }

    /** Updates the plan with all pairs of nodes from this bucket which need to be connected */
    public void updatePlan() {
      while (!this.sources.isEmpty() && !this.targets.isEmpty()) {
        // Greedily pair the highest-priority source with the highest-priority
        // target. Polled nodes are re-inserted below only while they still
        // have capacity to move.
        Source source = this.sources.poll();
        Target target = this.targets.poll();
        if (!wasUrgent(source) && !wasUrgent(target)) {
          // Due to ordering of DataNodes we can skip the rest
          break;
        }
        long size = moveSize(source, target);
        if (size > 0) {
          scheduleTask(source, size, target);
        }
        if (source.getAvailableMoveSize() > 0) {
          this.sources.add(source);
        }
        if (target.getAvailableMoveSize() > 0) {
          this.targets.add(target);
        }
        // Loop termination:
        // In each step we either scheduleTask, therefore decreasing sum (over
        // all nodes) of availableMoveSize, or decrease number of nodes in
        // sources or targets queue, all of them are bounded by 0.
      }
    }

    /** Determines how much data to move between given nodes */
    private long moveSize(Source source, BalancerDatanode target) {
      // TODO balancing concurrency
      return Math.min(source.getAvailableMoveSize(), target.getAvailableMoveSize());
    }

    /** Sort internal queues again in case DataNodes was changed externally */
    public void externalUpdate() {
      // sources and targets might no longer be a proper priority queues:
      // scheduling moves in another bucket changes node priorities, and
      // PriorityQueue does not reorder existing elements, so rebuild both.
      this.sources = new PriorityQueue<Source>((Collection<Source>) this.sources);
      this.targets = new PriorityQueue<Target>((Collection<Target>) this.targets);
    }

    /**
     * We rely on this ordering in Bucket#updatePlan loop termination condition,
     * additional priorities should be expressed in proper (source/target) comparator below.
     * Because of this condition SourceComparator and TargetComparator are not reverse of each
     * other.
     */
    private abstract class BalancerDatanodeComparator implements Comparator<BalancerDatanode> {
      // Urgent nodes (initially outside the balanced band) sort before
      // non-urgent ones.
      @Override
      public int compare(BalancerDatanode o1, BalancerDatanode o2) {
        return Boolean.valueOf(wasUrgent(o2)).compareTo(wasUrgent(o1));
      }
    }

    /** Orders sources: urgent first, then by least remaining space (most utilized). */
    private final class SourceComparator extends BalancerDatanodeComparator {
      @Override
      public int compare(BalancerDatanode o1, BalancerDatanode o2) {
        int ret = super.compare(o1, o2);
        if (ret == 0) {
          ret = Double.valueOf(o1.getCurrentRemaining()).compareTo(o2.getCurrentRemaining());
        }
        // TODO concurrency level can also be taken into consideration
        return ret;
      }
    }

    /** Orders targets: urgent first, then by most remaining space (least utilized). */
    private final class TargetComparator extends BalancerDatanodeComparator {
      @Override
      public int compare(BalancerDatanode o1, BalancerDatanode o2) {
        int ret = super.compare(o1, o2);
        if (ret == 0) {
          ret = Double.valueOf(o2.getCurrentRemaining()).compareTo(o1.getCurrentRemaining());
        }
        // TODO concurrency level can also be taken into consideration
        return ret;
      }
    }
  }

  /** Prints data distribution based on report from NameNode */
  public static void logDataDistribution(DatanodeInfo[] report) {
    if (LOG.isInfoEnabled()) {
      double avgRemaining = computeAvgRemaining(Arrays.asList(report));
      StringBuilder msg = new StringBuilder("Data distribution report: avgRemaining "
          + avgRemaining);
      for (DatanodeInfo node : report) {
        msg.append("\n").append(node.getName());
        msg.append(" remaining ").append(getRemaining(node));
        msg.append(" raw ").append(node.getRemaining()).append(" / ").append(node.getCapacity());
      }
      LOG.info(msg);
    }
  }
}