/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.server.namenode;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.server.namenode.BlocksMap.BlockInfo;
import org.apache.hadoop.net.DNSToSwitchMapping;
import org.apache.hadoop.net.NetworkTopology;
import org.apache.hadoop.util.HostsFileReader;
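/**
* Block placement policy aware of the HBase directory layout
* (/...HBASE/TableName/RegionName/ColumnFamily/HFile). For files matching
* that layout, replica deletion prefers the datanode holding the fewest
* replicas of blocks from the same directory, approximating the original
* favored-node assignment; all other files fall back to the behavior of
* {@link BlockPlacementPolicyConfigurable}.
*/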
public class BlockPlacementPolicyHBase extends BlockPlacementPolicyConfigurable {
private static final int ITERATE_THROUGH_BLOCKS_THRESHOLD = 500; // maximum number of blocks to examine per directory
private static final String MD5_CODE_REGEX = "[a-f0-9]{32}"; // 32 lowercase hex characters
private static final String HBASE_DIRECTORY_REGEX = ".*HBASE"; // leading directories, the last ending with "HBASE"
private static final String TABLE_NAME_REGEX = "[a-zA-Z0-9_][a-zA-Z0-9_.\\-]*";
private static final String REGION_NAME_REGEX = MD5_CODE_REGEX; // encoded region names are MD5 hashes
private static final String COLUMN_FAMILY_NAME_REGEX = "[^.:][^:]*"; // must not start with '.' and may not contain ':'
private static final String HFILE_REGEX = MD5_CODE_REGEX;
private static final String HBASE_FILE_REGEX = Path.SEPARATOR + HBASE_DIRECTORY_REGEX + Path.SEPARATOR
+ TABLE_NAME_REGEX + Path.SEPARATOR + REGION_NAME_REGEX + Path.SEPARATOR
+ COLUMN_FAMILY_NAME_REGEX + Path.SEPARATOR + HFILE_REGEX; // /*HBASE/TableName/RegionName/ColumnFamily/HFile
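// A hypothetical example of a path that matches HBASE_FILE_REGEX (table
// "MyTable", column family "family", region and HFile names being
// 32-character hex MD5 strings):
// /HBASE/MyTable/0123456789abcdef0123456789abcdef/family/0123456789abcdef0123456789abcdef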
private FSNamesystem nameSystem;
BlockPlacementPolicyHBase() {
}
/** {@inheritDoc} */
@Override
public void initialize(Configuration conf, FSClusterStats stats, NetworkTopology clusterMap,
HostsFileReader hostsReader, DNSToSwitchMapping dnsToSwitchMapping, FSNamesystem ns) {
this.nameSystem = ns;
super.initialize(conf, stats, clusterMap, hostsReader, dnsToSwitchMapping, ns);
}
/**
* For HBase, we try not to delete replicas from datanodes that were among
* the original favored nodes. Since the original favored nodes are not
* stored anywhere, the strategy is to iterate through the files in the same
* directory and delete from the least frequently used datanode.
* However, since a directory may contain a large number of blocks, at most
* {@link #ITERATE_THROUGH_BLOCKS_THRESHOLD} blocks are examined.
*
* Also, since favored nodes may change over time, the files are sampled in
* random order rather than in creation order.
*/
@Override
public DatanodeDescriptor chooseReplicaToDelete(FSInodeInfo inode, Block block,
short replicationFactor, Collection<DatanodeDescriptor> first,
Collection<DatanodeDescriptor> second) {
try {
if (!(inode instanceof INodeFile) || !inode.getFullPathName().matches(HBASE_FILE_REGEX)) {
return super.chooseReplicaToDelete(inode, block, replicationFactor, first, second);
}
} catch (IOException e) {
if (NameNode.LOG.isDebugEnabled()) {
NameNode.LOG.debug("Couldn't get full path name. " + e.getMessage());
}
return super.chooseReplicaToDelete(inode, block, replicationFactor, first, second);
}
INodeFile inodeFile = (INodeFile) inode;
INodeDirectory parent = inodeFile.getParent();
if (parent == null) { // The file has probably been renamed or deleted.
return super.chooseReplicaToDelete(inodeFile, block, replicationFactor, first, second);
}
// Maps each datanode to the number of block replicas it holds for files in this directory.
HashMap<DatanodeDescriptor, Integer> dataNodeUsage = directoryDataNodeUsage(parent,
ITERATE_THROUGH_BLOCKS_THRESHOLD);
// Now pick the datanode with the least usage. In case of a tie, nodes from
// the first collection take priority.
DatanodeDescriptor minUsageInstance = getMinUsage(first, Integer.MAX_VALUE, null, dataNodeUsage);
// The minimum usage found among the first nodes is the value the second
// nodes must beat.
int minUsage = (minUsageInstance != null) ? dataNodeUsage.get(minUsageInstance)
: Integer.MAX_VALUE;
// Only consider the second collection when removing one of its nodes cannot leave all remaining replicas on a single rack:
if (minUsageInstance == null || second.size() > 1 || nodeOnMultipleRacks(first)) {
minUsageInstance = getMinUsage(second, minUsage, minUsageInstance, dataNodeUsage);
}
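// Worked example (hypothetical nodes): if first = {A, B} with both on /rack1
// and second = {C} on /rack2, the condition above is false, so C is never
// considered and the least used of {A, B} is returned.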
return minUsageInstance;
}
/**
* Checks whether the nodes in <code>nodes</code> span more than one rack.
* Returns false only if all the nodes are on the same rack (or fewer than
* two nodes are given); returns true otherwise.
*/
private boolean nodeOnMultipleRacks(Collection<DatanodeDescriptor> nodes) {
DatanodeDescriptor previous = null;
for (DatanodeDescriptor node : nodes) {
if (previous != null && !previous.getNetworkLocation().equals(node.getNetworkLocation())) {
return true;
}
previous = node;
}
return false;
}
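// nodeOnMultipleRacks example (hypothetical locations): nodes on
// {/rack1, /rack1} yield false; nodes on {/rack1, /rack2} yield true.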
/**
* Finds the DatanodeDescriptor in <code>nodes</code> whose value in
* <code>usageMap</code> is the smallest; datanodes absent from the map count
* as zero usage. If no node's usage is lower than
* <code>minUsageDefault</code>, <code>minUsageInstanceDefault</code> is
* returned instead.
*/
private DatanodeDescriptor getMinUsage(Collection<DatanodeDescriptor> nodes, int minUsageDefault,
DatanodeDescriptor minUsageInstanceDefault, HashMap<DatanodeDescriptor, Integer> usageMap) {
for (DatanodeDescriptor dnd : nodes) {
Integer usage = usageMap.get(dnd);
if (usage == null) {
usage = 0;
}
if (usage < minUsageDefault) {
minUsageDefault = usage;
minUsageInstanceDefault = dnd;
}
}
return minUsageInstanceDefault;
}
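// getMinUsage example (hypothetical usages): with usageMap = {A=2, B=5},
// calling getMinUsage({A, B}, 3, C, usageMap) returns A because 2 < 3, while
// getMinUsage({B}, 3, C, usageMap) returns C because 5 >= 3.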
/**
* Iterates through the files in the directory <code>dir</code> and counts,
* for each datanode, how many block replicas of those files it holds.
* At most <code>threshold</code> blocks are examined.
*/
private HashMap<DatanodeDescriptor, Integer> directoryDataNodeUsage(INodeDirectory dir,
int threshold) {
HashMap<DatanodeDescriptor, Integer> dataNodeUsage = new HashMap<DatanodeDescriptor, Integer>();
List<INode> children;
nameSystem.readLock();
try {
List<INode> rawChildren = dir.getChildrenRaw();
if (rawChildren == null) {
return dataNodeUsage;
}
children = new ArrayList<INode>(rawChildren);
// Sample the files in random order so the limited block sample is not
// biased by creation order.
Collections.shuffle(children);
for (INode node : children) {
if (!(node instanceof INodeFile)) { // Skip subdirectories; a column-family directory is expected to contain only HFiles.
continue;
}
INodeFile file = (INodeFile) node;
BlockInfo[] blocks = file.getBlocks();
for (BlockInfo block : blocks) {
if (threshold == 0) { // The block budget is exhausted; return the counts gathered so far.
return dataNodeUsage;
}
int replication = block.numNodes(); // number of datanodes currently storing this block
for (int i = 0; i < replication; i++) {
DatanodeDescriptor datanode = block.getDatanode(i);
Integer currentUsage = dataNodeUsage.get(datanode);
dataNodeUsage.put(datanode, currentUsage == null ? 1 : currentUsage + 1);
}
threshold--;
}
}
} finally {
nameSystem.readUnlock();
}
return dataNodeUsage;
}
}
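// A minimal sketch of how this policy might be enabled in hdfs-site.xml,
// assuming this branch selects the placement policy through the
// "dfs.block.replicator.classname" key (the key name is an assumption here):
//
// <property>
//   <name>dfs.block.replicator.classname</name>
//   <value>org.apache.hadoop.hdfs.server.namenode.BlockPlacementPolicyHBase</value>
// </property>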