/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hdfs.server.datanode.fsdataset; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_THRESHOLD_DEFAULT; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_THRESHOLD_KEY; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_PREFERENCE_FRACTION_DEFAULT; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_PREFERENCE_FRACTION_KEY; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.Random; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configurable; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.util.DiskChecker.DiskOutOfSpaceException; /** * A DN volume choosing policy which takes into account the amount of free * space on each of the available volumes when considering where to assign a * new replica allocation. By default this policy prefers assigning replicas to * those volumes with more available free space, so as to over time balance the * available space of all the volumes within a DN. */ public class AvailableSpaceVolumeChoosingPolicy<V extends FsVolumeSpi> implements VolumeChoosingPolicy<V>, Configurable { private static final Log LOG = LogFactory.getLog(AvailableSpaceVolumeChoosingPolicy.class); private static final Random RAND = new Random(); private long balancedSpaceThreshold = DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_THRESHOLD_DEFAULT; private float balancedPreferencePercent = DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_PREFERENCE_FRACTION_DEFAULT; @Override public synchronized void setConf(Configuration conf) { balancedSpaceThreshold = conf.getLong( DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_THRESHOLD_KEY, DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_THRESHOLD_DEFAULT); balancedPreferencePercent = conf.getFloat( DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_PREFERENCE_FRACTION_KEY, DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_PREFERENCE_FRACTION_DEFAULT); LOG.info("Available space volume choosing policy initialized: " + DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_THRESHOLD_KEY + " = " + balancedSpaceThreshold + ", " + DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_PREFERENCE_FRACTION_KEY + " = " + balancedPreferencePercent); if (balancedPreferencePercent > 1.0) { LOG.warn("The value of " + DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_PREFERENCE_FRACTION_KEY + " is greater than 1.0 but should be in the range 0.0 - 1.0"); } if (balancedPreferencePercent < 0.5) { LOG.warn("The value of " + DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_PREFERENCE_FRACTION_KEY + " is less than 0.5 so volumes with less available disk space will receive more block allocations"); } } @Override public synchronized Configuration getConf() { // Nothing to do. Only added to fulfill the Configurable contract. return null; } private final VolumeChoosingPolicy<V> roundRobinPolicyBalanced = new RoundRobinVolumeChoosingPolicy<V>(); private final VolumeChoosingPolicy<V> roundRobinPolicyHighAvailable = new RoundRobinVolumeChoosingPolicy<V>(); private final VolumeChoosingPolicy<V> roundRobinPolicyLowAvailable = new RoundRobinVolumeChoosingPolicy<V>(); @Override public synchronized V chooseVolume(List<V> volumes, final long replicaSize) throws IOException { if (volumes.size() < 1) { throw new DiskOutOfSpaceException("No more available volumes"); } AvailableSpaceVolumeList volumesWithSpaces = new AvailableSpaceVolumeList(volumes); if (volumesWithSpaces.areAllVolumesWithinFreeSpaceThreshold()) { // If they're actually not too far out of whack, fall back on pure round // robin. V volume = roundRobinPolicyBalanced.chooseVolume(volumes, replicaSize); if (LOG.isDebugEnabled()) { LOG.debug("All volumes are within the configured free space balance " + "threshold. Selecting " + volume + " for write of block size " + replicaSize); } return volume; } else { V volume = null; // If none of the volumes with low free space have enough space for the // replica, always try to choose a volume with a lot of free space. long mostAvailableAmongLowVolumes = volumesWithSpaces .getMostAvailableSpaceAmongVolumesWithLowAvailableSpace(); List<V> highAvailableVolumes = extractVolumesFromPairs( volumesWithSpaces.getVolumesWithHighAvailableSpace()); List<V> lowAvailableVolumes = extractVolumesFromPairs( volumesWithSpaces.getVolumesWithLowAvailableSpace()); float preferencePercentScaler = (highAvailableVolumes.size() * balancedPreferencePercent) + (lowAvailableVolumes.size() * (1 - balancedPreferencePercent)); float scaledPreferencePercent = (highAvailableVolumes.size() * balancedPreferencePercent) / preferencePercentScaler; if (mostAvailableAmongLowVolumes < replicaSize || RAND.nextFloat() < scaledPreferencePercent) { volume = roundRobinPolicyHighAvailable.chooseVolume( highAvailableVolumes, replicaSize); if (LOG.isDebugEnabled()) { LOG.debug("Volumes are imbalanced. Selecting " + volume + " from high available space volumes for write of block size " + replicaSize); } } else { volume = roundRobinPolicyLowAvailable.chooseVolume( lowAvailableVolumes, replicaSize); if (LOG.isDebugEnabled()) { LOG.debug("Volumes are imbalanced. Selecting " + volume + " from low available space volumes for write of block size " + replicaSize); } } return volume; } } /** * Used to keep track of the list of volumes we're choosing from. */ private class AvailableSpaceVolumeList { private final List<AvailableSpaceVolumePair> volumes; public AvailableSpaceVolumeList(List<V> volumes) throws IOException { this.volumes = new ArrayList<AvailableSpaceVolumePair>(); for (V volume : volumes) { this.volumes.add(new AvailableSpaceVolumePair(volume)); } } /** * Check if the available space on all the volumes is roughly equal. * * @param volumes the volumes to check * @return true if all volumes' free space is within the configured threshold, * false otherwise. * @throws IOException * in the event of error checking amount of available space */ public boolean areAllVolumesWithinFreeSpaceThreshold() { long leastAvailable = Long.MAX_VALUE; long mostAvailable = 0; for (AvailableSpaceVolumePair volume : volumes) { leastAvailable = Math.min(leastAvailable, volume.getAvailable()); mostAvailable = Math.max(mostAvailable, volume.getAvailable()); } return (mostAvailable - leastAvailable) < balancedSpaceThreshold; } /** * @return the minimum amount of space available on a single volume, * across all volumes. */ private long getLeastAvailableSpace() { long leastAvailable = Long.MAX_VALUE; for (AvailableSpaceVolumePair volume : volumes) { leastAvailable = Math.min(leastAvailable, volume.getAvailable()); } return leastAvailable; } /** * @return the maximum amount of space available across volumes with low space. */ public long getMostAvailableSpaceAmongVolumesWithLowAvailableSpace() { long mostAvailable = Long.MIN_VALUE; for (AvailableSpaceVolumePair volume : getVolumesWithLowAvailableSpace()) { mostAvailable = Math.max(mostAvailable, volume.getAvailable()); } return mostAvailable; } /** * @return the list of volumes with relatively low available space. */ public List<AvailableSpaceVolumePair> getVolumesWithLowAvailableSpace() { long leastAvailable = getLeastAvailableSpace(); List<AvailableSpaceVolumePair> ret = new ArrayList<AvailableSpaceVolumePair>(); for (AvailableSpaceVolumePair volume : volumes) { if (volume.getAvailable() <= leastAvailable + balancedSpaceThreshold) { ret.add(volume); } } return ret; } /** * @return the list of volumes with a lot of available space. */ public List<AvailableSpaceVolumePair> getVolumesWithHighAvailableSpace() { long leastAvailable = getLeastAvailableSpace(); List<AvailableSpaceVolumePair> ret = new ArrayList<AvailableSpaceVolumePair>(); for (AvailableSpaceVolumePair volume : volumes) { if (volume.getAvailable() > leastAvailable + balancedSpaceThreshold) { ret.add(volume); } } return ret; } } /** * Used so that we only check the available space on a given volume once, at * the beginning of {@link AvailableSpaceVolumeChoosingPolicy#chooseVolume(List, long)}. */ private class AvailableSpaceVolumePair { private final V volume; private final long availableSpace; public AvailableSpaceVolumePair(V volume) throws IOException { this.volume = volume; this.availableSpace = volume.getAvailable(); } public long getAvailable() { return availableSpace; } public V getVolume() { return volume; } } private List<V> extractVolumesFromPairs(List<AvailableSpaceVolumePair> volumes) { List<V> ret = new ArrayList<V>(); for (AvailableSpaceVolumePair volume : volumes) { ret.add(volume.getVolume()); } return ret; } }