/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.server.namenode;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Random;
import java.util.Set;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.net.DNSToSwitchMapping;
import org.apache.hadoop.net.NetworkTopology;
import org.apache.hadoop.net.Node;
import org.apache.hadoop.net.NodeBase;
import org.apache.hadoop.util.HostsFileReader;
public class BlockPlacementPolicyConfigurable extends
BlockPlacementPolicyDefault {
/**
* RackRingInfo contains the rack information with respect to the ring it
* belongs to and the internal machine ring it keeps.
*/
protected class RackRingInfo {
public int index;
public List<String> rackNodes;
public HashMap<String,Integer> rackNodesMap;
public Integer findNode(DatanodeDescriptor node) {
Integer retVal = rackNodesMap.get(node.getHostName());
if (retVal == null) {
retVal = rackNodesMap.get(node.getName());
if (retVal == null) {
retVal = rackNodesMap.get(node.getHost());
if (retVal == null) {
LOG.info("Didn't find " + node.getHostName() +
" - " + node.getName() + " - " + node.getHost());
}
}
}
return retVal;
}
}
  public static final Log LOG =
      LogFactory.getLog(BlockPlacementPolicyConfigurable.class);
  // a fair rw lock guarding racks and racksMap, which are always swapped
  // together (see hostsUpdated).
  ReentrantReadWriteLock rwLock = new ReentrantReadWriteLock(true);
  protected List<String> racks; // Ring of racks
  protected HashMap<String,RackRingInfo> racksMap; // rack name -> RackRingInfo
  protected int rackWindow;     // size of the rack window along the rack ring
  protected int machineWindow;  // size of the machine window within a rack's ring
  Random r = null;              // randomness source; seedable via the (long) constructor
  HostsFileReader hostsReader;  // supplies the current list of included hosts
  DNSToSwitchMapping dnsToSwitchMapping; // resolves host names to rack locations
  /** Builds and fully initializes the policy in one step. */
  BlockPlacementPolicyConfigurable(Configuration conf, FSClusterStats stats,
      NetworkTopology clusterMap, HostsFileReader hostsReader,
      DNSToSwitchMapping dnsToSwitchMapping) {
    initialize(conf, stats, clusterMap, hostsReader, dnsToSwitchMapping, null);
  }
  /** No-arg constructor; initialize() must be called separately. */
  BlockPlacementPolicyConfigurable() {
  }
  /** Fixes the random seed so placement decisions are deterministic. */
  BlockPlacementPolicyConfigurable(long seed) {
    r = new Random(seed);
  }
private class HashComparator implements Comparator<String> {
Random rand = new Random();
public int compare(String o1, String o2) {
rand.setSeed(o1.hashCode());
int hc1 = rand.nextInt();
rand.setSeed(o2.hashCode());
int hc2 = rand.nextInt();
if (hc1 < hc2) return -1;
if (hc1 > hc2) return 1;
return 0;
}
}
  /** Acquires the shared read lock guarding racks and racksMap. */
  private void readLock() {
    rwLock.readLock().lock();
  }
  /** Releases the shared read lock. */
  private void readUnlock() {
    rwLock.readLock().unlock();
  }
  /** Acquires the exclusive write lock guarding racks and racksMap. */
  private void writeLock() {
    rwLock.writeLock().lock();
  }
  /** Releases the exclusive write lock. */
  private void writeUnlock() {
    rwLock.writeLock().unlock();
  }
  /**
   * The two comparators below are used to hash rack and host names when
   * forming the rings. They can be overridden to implement different
   * ordering strategies.
   */
  protected Comparator<String> rackComparator = new HashComparator();
  protected Comparator<String> hostComparator = new HashComparator();
/** {@inheritDoc} */
public void initialize(Configuration conf, FSClusterStats stats,
NetworkTopology clusterMap, HostsFileReader hostsReader,
DNSToSwitchMapping dnsToSwitchMapping, FSNamesystem ns) {
super.initialize(conf, stats, clusterMap, hostsReader, dnsToSwitchMapping, ns);
this.rackWindow = conf.getInt("dfs.replication.rackwindow", 2);
this.machineWindow = conf.getInt("dfs.replication.machineWindow", 5);
this.racks = new ArrayList<String>();
this.hostsReader = hostsReader;
this.dnsToSwitchMapping = dnsToSwitchMapping;
hostsUpdated(true);
if (r == null) {
r = new Random();
}
LOG.info("BlockPlacementPolicyConfigurable initialized");
}
  /** {@inheritDoc} */
  public void hostsUpdated() {
    // Not at startup: an unresolvable rack aborts the rebuild with
    // DefaultRackException instead of being dropped.
    hostsUpdated(false);
  }
public void hostsUpdated(boolean startup) {
List<String> hostsIn = new ArrayList<String>(hostsReader.getHosts());
List<String> hostsRacks = dnsToSwitchMapping.resolve(hostsIn);
HashMap<String,RackRingInfo> tempRacksMap =
new HashMap<String,RackRingInfo>();
List<String> tempRacks = new ArrayList<String>();
int index = hostsRacks.indexOf(NetworkTopology.DEFAULT_RACK);
if (index != -1) {
if (!startup) {
throw new DefaultRackException("Could not resolve rack for : "
+ hostsIn.get(index) + " probably due to a DNS issue");
} else {
// We do not want to abort startup, just remove the bad datanode.
for (int i = 0; i < hostsRacks.size(); i++) {
if (hostsRacks.get(i).equals(NetworkTopology.DEFAULT_RACK)) {
LOG.warn("Could not resolve rack for : "
+ hostsIn.get(i) + " probably due to a DNS issue, removing"
+ " the host since we are in startup");
hostsRacks.remove(i);
hostsReader.getHosts().remove(hostsIn.get(i));
hostsIn.remove(i);
i--;
}
}
}
}
for (int i=0; i<hostsIn.size(); i++) {
String host = hostsIn.get(i);
String rack = hostsRacks.get(i);
RackRingInfo rackinfo = tempRacksMap.get(rack);
if (rackinfo == null) {
LOG.info("Adding rack:" + rack);
tempRacks.add(rack);
rackinfo = new RackRingInfo();
rackinfo.rackNodes = new ArrayList<String>();
tempRacksMap.put(rack, rackinfo);
}
LOG.info("Adding host:" + host);
rackinfo.rackNodes.add(host);
}
Collections.sort(tempRacks, rackComparator);
StringBuffer ringRep = new StringBuffer("\nRing Topology:\n");
for (int i = 0; i < tempRacks.size(); i++) {
RackRingInfo rackinfo = tempRacksMap.get(tempRacks.get(i));
rackinfo.index = i;
List<String> rackNodes = rackinfo.rackNodes;
HashMap<String,Integer> nodesMap = new HashMap<String,Integer>();
rackinfo.rackNodesMap = nodesMap;
ringRep.append("\tRing " + i + ": " + tempRacks.get(i) + "\n");
Collections.sort(rackNodes, hostComparator);
for (int j=0; j<rackNodes.size(); j++) {
ringRep.append("\t\t" + j + ": " + rackNodes.get(j) + "\n");
nodesMap.put(rackNodes.get(j), j);
}
}
LOG.info(ringRep.toString());
// Update both datastructures together in a lock.
writeLock();
racksMap = tempRacksMap;
racks = tempRacks;
writeUnlock();
}
  /**
   * Returns a uniformly chosen position from the modular window
   * [begin, begin + windowSize) over a ring of n elements, skipping
   * positions in excludeSet; returns -1 when the window is empty or
   * entirely excluded.
   *
   * @param begin first position of the window (ring coordinates)
   * @param windowSize requested window size (capped at n)
   * @param n size of the ring
   * @param excludeSet positions that must not be returned
   * @return a non-excluded position inside the window, or -1 if none exists
   */
  protected int randomIntInWindow(int begin, int windowSize, int n,
      Set<Integer> excludeSet) {
    final int size = Math.min(windowSize, n);
    if (size <= 0) {
      return -1;
    }
    int adjustment = 0;
    for (Integer v: excludeSet) {
      // Position of the excluded element relative to the window start.
      int vindex = (v.intValue() - begin + n) % n;
      if (vindex < size) {
        adjustment++; // calculates excluded elements within window
      }
    }
    if (adjustment >= size) {
      return -1; // every slot in the window is excluded
    }
    int rindex = r.nextInt(size - adjustment); // ith element is chosen
    // Walk forward from begin, skipping excluded slots, until the
    // rindex-th admissible position is reached.
    int iterator = begin;
    for (int i = 0; i <= rindex; i++) {
      while (excludeSet.contains(iterator)) {
        iterator = (iterator + 1) % n;
      }
      if (i != rindex) {
        iterator = (iterator + 1) % n;
      }
    }
    return iterator;
  }
  /*
   * Chooses numOfReplicas targets from all data nodes: first replica local
   * to the writer, second via the ring-window scheme (same rack when only
   * two replicas are wanted), remainder via chooseRemainingReplicas.
   */
  @Override
  protected DatanodeDescriptor chooseTarget(int numOfReplicas,
      DatanodeDescriptor writer, HashMap<Node, Node> excludedNodes,
      long blocksize, int maxNodesPerRack, List<DatanodeDescriptor> results,
      boolean newBlock) {
    if (numOfReplicas == 0 || clusterMap.getNumOfLeaves() == 0) {
      return writer;
    }
    int numOfResults = results.size();
    if (writer == null && !newBlock) {
      // Re-replication of an existing block: treat the holder of the first
      // existing replica as the writer.
      writer = results.get(0);
    }
    try {
      if (numOfResults == 0) {
        writer = chooseLocalNode(writer, excludedNodes, blocksize,
            maxNodesPerRack, results);
        if (--numOfReplicas == 0) {
          return writer;
        }
      }
      if (numOfResults <= 1) {
        // If we have a replication factor of 2, place both replicas on the
        // same rack.
        if (numOfReplicas == 1) {
          chooseLocalRack(results.get(0), excludedNodes, blocksize,
              maxNodesPerRack, results);
        } else {
          chooseFirstInRemoteRack(results.get(0), excludedNodes, blocksize,
              maxNodesPerRack, results);
        }
        if (--numOfReplicas == 0) {
          return writer;
        }
      }
      chooseRemainingReplicas(numOfReplicas, excludedNodes, blocksize,
          maxNodesPerRack, results);
    } catch (NotEnoughReplicasException e) {
      // Best effort: return what we placed and log the shortfall.
      LOG.warn("Not able to place enough replicas, still in need of "
          + numOfReplicas);
    }
    return writer;
  }
/**
* Picks up the first replica stored in a remote rack.
* @param localMachine local machine that is writing the data
* @param excludedNodes nodes that should not be considered
* @param blocksize size of blocks
* @param maxReplicasPerRack maximum replicas per rack
* @param results datanodes used for replicas
* @throws NotEnoughReplicasException
*/
protected void chooseFirstInRemoteRack(DatanodeDescriptor localMachine,
HashMap<Node, Node> excludedNodes, long blocksize,
int maxReplicasPerRack, List<DatanodeDescriptor> results)
throws NotEnoughReplicasException {
readLock();
try {
RackRingInfo rackInfo = racksMap.get(localMachine.getNetworkLocation());
assert (rackInfo != null);
Integer machineId = rackInfo.findNode(localMachine);
assert (machineId != null);
if (!chooseRemoteRack(rackInfo.index, rackInfo.index, rackWindow + 1,
machineId, machineWindow, excludedNodes, blocksize,
maxReplicasPerRack, results, false)) {
LOG.info("Couldn't find a Datanode within node group. "
+ "Resorting to default policy.");
super.chooseRemoteRack(1, localMachine, excludedNodes, blocksize,
maxReplicasPerRack, results);
}
} finally {
readUnlock();
}
}
/**
* returns the best possible match of nodes to the first three replicas in
* in the current replication scheme
* result[0] = first replica or null
* result[1] = second replica or null
* result[2] = third replica or null
*/
protected DatanodeDescriptor[] findBest(List<DatanodeDescriptor> listOfNodes) {
DatanodeDescriptor[] result = new DatanodeDescriptor[3];
result[0] = listOfNodes.isEmpty() ? null : listOfNodes.get(0);
result[1] = null;
result[2] = null;
for (DatanodeDescriptor n : listOfNodes) {
findBestWithFirst(n, listOfNodes, result);
if (result[2] != null)
return result;
}
if (result[1] == null && listOfNodes.size() > 1) {
findBestWithoutFirst(listOfNodes, result);
}
return result;
}
  /**
   * Finds the best partial triple that includes {@code first} as the first
   * replica, writing into {@code result} in place.  A complete
   * (first, second, third) match returns immediately; otherwise the first
   * valid (first, second) pair encountered is recorded.
   *
   * @param first first replica to be used
   * @param listOfNodes nodes to choose from
   * @param result array that stores results
   */
  private void findBestWithFirst(DatanodeDescriptor first,
      List<DatanodeDescriptor> listOfNodes, DatanodeDescriptor[] result) {
    for (int in2 = 0; in2 < listOfNodes.size(); in2++) {
      DatanodeDescriptor n2 = listOfNodes.get(in2);
      if (!first.equals(n2)) {
        // Remember the first valid pair, but keep scanning for a triple.
        if (result[1] == null && inWindow(first, n2)) {
          result[0] = first;
          result[1] = n2;
        }
        for (int in3 = in2 + 1; in3 < listOfNodes.size(); in3++) {
          DatanodeDescriptor n3 = listOfNodes.get(in3);
          if (!first.equals(n3)
              && inWindow(first, n3, n2)) {
            result[0] = first;
            result[1] = n2;
            result[2] = n3;
            return;
          }
        }
      }
    }
  }
/**
* Verifies if testing node is within right windows of first node
* @param first first node being considered
* @param testing node we are testing to check if it is within window or not
* @return We return true if it is successful, and not otherwise
*/
private boolean inWindow(DatanodeDescriptor first, DatanodeDescriptor testing) {
readLock();
try {
RackRingInfo rackInfo = racksMap.get(first.getNetworkLocation());
assert (rackInfo != null);
Integer machineId = rackInfo.findNode(first);
assert (machineId != null);
final int rackWindowStart = rackInfo.index;
final RackRingInfo rackTest = racksMap.get(testing.getNetworkLocation());
assert (rackTest != null);
final int rackDist = (rackTest.index - rackWindowStart + racks.size())
% racks.size();
if (rackDist < rackWindow + 1 && rackTest.index != rackInfo.index) {
// inside rack window
final Integer idFirst = rackInfo.findNode(first);
assert (idFirst != null);
final int sizeFirstRack = rackInfo.rackNodes.size();
final int sizeTestRack = rackTest.rackNodes.size();
final int start = idFirst * sizeTestRack / sizeFirstRack;
final Integer idTest = rackTest.findNode(testing);
assert (idTest != null);
final int dist = (idTest - start + sizeTestRack) % sizeTestRack;
if (dist < machineWindow) { // inside machine Window
return true;
}
}
return false;
} finally {
readUnlock();
}
}
  /**
   * Verifies if the testing nodes are within the right windows of the
   * first node: both must share a rack inside first's rack window, and at
   * least one ordering of the pair must start inside the window
   * proportionally mapped from first's ring position while keeping the two
   * nodes within machineWindow of each other.
   *
   * @param first first node being considered
   * @param testing1 node we are testing to check if it is within window or not
   * @param testing2 node we are testing to check if it is within window or not
   * @return We return true if it is successful, and not otherwise
   */
  private boolean inWindow(DatanodeDescriptor first,
      DatanodeDescriptor testing1, DatanodeDescriptor testing2) {
    readLock();
    try {
      // The pair must live in a single rack.
      if (!testing1.getNetworkLocation().equals(testing2.getNetworkLocation())) {
        return false;
      }
      RackRingInfo rackInfo = racksMap.get(first.getNetworkLocation());
      assert (rackInfo != null);
      // NOTE(review): machineId only feeds the assertion below; the same
      // position is re-resolved as idFirst further down.
      Integer machineId = rackInfo.findNode(first);
      assert (machineId != null);
      final int rackWindowStart = rackInfo.index;
      final RackRingInfo rackTest = racksMap.get(testing1.getNetworkLocation());
      assert (rackTest != null);
      // Ring distance from first's rack to the pair's rack.
      final int rackDist = (rackTest.index - rackWindowStart + racks.size())
          % racks.size();
      if (rackDist < rackWindow + 1 && rackTest.index != rackInfo.index) {
        // inside rack window
        final int rackSize = rackTest.rackNodes.size();
        Integer idN2 = rackTest.findNode(testing1);
        assert (idN2 != null);
        Integer idN3 = rackTest.findNode(testing2);
        assert (idN3 != null);
        final Integer idFirst = rackInfo.findNode(first);
        assert (idFirst != null);
        final int sizeFirstRack = rackInfo.rackNodes.size();
        // Window endpoints mapped proportionally from first's rack onto the
        // test rack: idFirst's image and its ring predecessor's image.
        final int end = idFirst * rackSize / sizeFirstRack;
        // proportional to previous of idFirst
        final int prevIdFirst = (idFirst + sizeFirstRack - 1) % sizeFirstRack;
        int start = (prevIdFirst * rackSize / sizeFirstRack);
        int distPropWindow = (end - start + rackSize) % rackSize;
        if (distPropWindow > 0) {
          start = (start + 1) % rackSize;
          distPropWindow--;
        }
        // Accept the pair if either node can anchor the window and the
        // other follows it within machineWindow on the ring.
        int distIdN2 = (idN2 - start + rackSize) % rackSize;
        int distIdN3 = (idN3 - start + rackSize) % rackSize;
        int distN3N2 = (idN3 - idN2 + rackSize) % rackSize;
        int distN2N3 = (idN2 - idN3 + rackSize) % rackSize;
        if (distIdN2 <= distPropWindow && distN3N2 < machineWindow)
          return true;
        if (distIdN3 <= distPropWindow && distN2N3 < machineWindow)
          return true;
      }
      return false;
    } finally {
      readUnlock();
    }
  }
  /**
   * Finds the best match considering only the remote nodes: a pair inside
   * one rack whose ring distance (in either order) fits the machine
   * window.  On success result[1]/result[2] are set and result[0] is
   * cleared.
   * @param listOfNodes Datanodes to choose from
   * @param result Array containing results
   */
  private void findBestWithoutFirst(List<DatanodeDescriptor> listOfNodes,
      DatanodeDescriptor[] result) {
    readLock();
    try {
      for (int in2 = 0; in2 < listOfNodes.size(); in2++) {
        DatanodeDescriptor n2 = listOfNodes.get(in2);
        for (int in3 = in2 + 1; in3 < listOfNodes.size(); in3++) {
          DatanodeDescriptor n3 = listOfNodes.get(in3);
          if (n2.getNetworkLocation().equals(n3.getNetworkLocation())) {
            RackRingInfo rackInfo = racksMap.get(n2.getNetworkLocation());
            assert (rackInfo != null);
            final int rackSize = rackInfo.rackNodes.size();
            final Integer idN2 = rackInfo.findNode(n2);
            final Integer idN3 = rackInfo.findNode(n3);
            if (idN2 != null && idN3 != null) {
              // Ring distance n2 -> n3; if too far, try the reverse order.
              int dist = (idN3 - idN2 + rackSize) % rackSize;
              if (dist >= machineWindow) {
                dist = rackSize - dist; // try n2 - n3
              }
              if (dist < machineWindow) {
                result[0] = null;
                result[1] = n2;
                result[2] = n3;
                return;
              }
            }
          }
        }
      }
    } finally {
      readUnlock();
    }
  }
  /**
   * Chooses the remaining replicas after up to two have been allocated,
   * completing the (first, second, third) ring arrangement where possible
   * before falling back to random placement.  Holds the read lock for the
   * whole call, including recursive calls (the read lock is reentrant).
   *
   * @param numOfReplicas number of replicas still to place
   * @param excludedNodes Nodes that we should not consider
   * @param blocksize size of blocks
   * @param maxReplicasPerRack maximum number of replicas per rack
   * @param results array containing results
   * @throws NotEnoughReplicasException
   */
  protected void chooseRemainingReplicas (int numOfReplicas,
      HashMap<Node, Node> excludedNodes, long blocksize, int maxReplicasPerRack,
      List<DatanodeDescriptor> results) throws NotEnoughReplicasException {
    readLock();
    try {
      if (numOfReplicas <= 0) {
        return;
      }
      DatanodeDescriptor[] bestmatch = findBest(results);
      if (bestmatch[0] != null) { // there is a first replica: 1,X,X
        excludedNodes.put(bestmatch[0], bestmatch[0]);
        if (bestmatch[1] == null) { // there is no second replica: 1,0,0
          chooseFirstInRemoteRack(bestmatch[0], excludedNodes, blocksize,
              maxReplicasPerRack, results); // pick up second
          numOfReplicas--;
          // now search for the rest recursively
          chooseRemainingReplicas(numOfReplicas, excludedNodes, blocksize,
              maxReplicasPerRack, results);
          return;
        } else if (bestmatch[2] == null) { // no third replica: 1,1,0
          // find the third one
          excludedNodes.put(bestmatch[1], bestmatch[1]);
          RackRingInfo rack0 = racksMap.get(bestmatch[0].getNetworkLocation());
          RackRingInfo rack1 = racksMap.get(bestmatch[1].getNetworkLocation());
          int posR0 = rack0.findNode(bestmatch[0]);
          // Map the first replica's ring position proportionally onto the
          // second replica's rack to anchor the machine window.
          int firstMachine = posR0 * rack1.rackNodes.size()
              / rack0.rackNodes.size();
          if (!chooseMachine(bestmatch[1].getNetworkLocation(),
              firstMachine, machineWindow,
              excludedNodes, blocksize, maxReplicasPerRack, results)) {
            // if it doesn't get it in the rack, try at a different one
            LOG.info("Couldn't find 3rd Datanode on the same rack as 2nd. "
                + "Resorting to a different rack in the same node group.");
            chooseFirstInRemoteRack(bestmatch[0], excludedNodes, blocksize,
                maxReplicasPerRack, results);
          }
          numOfReplicas--;
        }
      } else if (bestmatch[1] != null && bestmatch[2] != null) { // 0,1,1
        // Second and third replicas exist; derive where a compatible first
        // replica's window would have to start and search there.
        RackRingInfo rackInfo = racksMap.get(bestmatch[1].getNetworkLocation());
        Integer posN1 = rackInfo.findNode(bestmatch[1]);
        Integer posN2 = rackInfo.findNode(bestmatch[2]);
        if (posN1 != null && posN2 != null) {
          int rackSize = rackInfo.rackNodes.size();
          int diff = (posN2 - posN1 + rackSize) % rackSize;
          if (diff >= machineWindow) {
            // Try the opposite ordering of the pair.
            Integer tmp = posN1;
            posN1 = posN2;
            posN2 = tmp;
            diff = rackSize - diff;
          }
          int newMachineWindow = machineWindow - diff;
          assert (newMachineWindow > 0);
          if (rackSize - diff < machineWindow) {
            newMachineWindow = rackSize;
          }
          final int firstRack = (rackInfo.index - rackWindow + racks.size())
              % racks.size();
          int machineIdx = (posN1 - newMachineWindow + 1 + rackSize) % rackSize;
          if (chooseRemoteRack(rackInfo.index, firstRack, rackWindow,
              machineIdx, newMachineWindow, excludedNodes, blocksize,
              maxReplicasPerRack, results, true)) {
            numOfReplicas--;
          }
        }
      }
      // get the rest randomly
      if (numOfReplicas > 0) {
        int replicas = results.size();
        if (replicas < 3) {
          LOG.info("Picking up random replicas from default policy after "
              + results.size() + " replicas have been chosen");
        }
        super.chooseRandom(numOfReplicas, NodeBase.ROOT, excludedNodes,
            blocksize, maxReplicasPerRack, results);
      }
    } finally {
      readUnlock();
    }
  }
  /**
   * Picks up a remote machine within the defined window.
   * @param rackIdx rack the request is coming from and that should be avoided
   * @param firstRack rack that starts window
   * @param rackWindow rack window size
   * @param machineIdx index of first replica within its rack
   * @param windowSize size of the machine window
   * @param excludedNodes list of black listed nodes.
   * @param blocksize size of a block
   * @param maxReplicasPerRack maximum number of replicas per rack
   * @param results List of results
   * @param reverse adjustment when looking forward or backward.
   * @return true when a machine was chosen and appended to results, false
   *         when the whole rack window was exhausted without a valid target
   * @throws NotEnoughReplicasException
   */
  protected boolean chooseRemoteRack(int rackIdx, int firstRack, int rackWindow,
      int machineIdx, int windowSize, HashMap<Node, Node> excludedNodes,
      long blocksize, int maxReplicasPerRack, List<DatanodeDescriptor> results,
      boolean reverse) throws NotEnoughReplicasException {
    // randomly choose one node from remote racks
    readLock();
    try {
      HashSet<Integer> excludedRacks = new HashSet<Integer>();
      excludedRacks.add(rackIdx);
      int n = racks.size();
      int currRackSize = racksMap.get(racks.get(rackIdx)).rackNodes.size();
      // Draw racks from the window until one yields a machine or the
      // window is exhausted.
      while (excludedRacks.size() < rackWindow) {
        int newRack = randomIntInWindow(firstRack, rackWindow, n, excludedRacks);
        if (newRack < 0)
          break;
        excludedRacks.add(newRack);
        int newRackSize = racksMap.get(racks.get(newRack)).rackNodes.size();
        // Scale the machine anchor to the candidate rack's ring size.
        int firstMachine = machineIdx * newRackSize / currRackSize;
        int newWindowSize = windowSize;
        if (reverse) {
          // Looking backwards: round the anchor up and rescale the window.
          firstMachine = ((int) Math.ceil((double) machineIdx * newRackSize
              / currRackSize))
              % newRackSize;
          newWindowSize = Math.max(1, windowSize * newRackSize / currRackSize);
        }
        if (newWindowSize <= 0) {
          continue;
        }
        if (chooseMachine(racks.get(newRack), firstMachine, newWindowSize,
            excludedNodes, blocksize, maxReplicasPerRack, results)) {
          return true;
        }
      }
      return false;
    } finally {
      readUnlock();
    }
  }
/**
* Chosses a machine within a window inside a rack
* @param rack rack to choose from
* @param firstMachine machine that starts window
* @param windowSize size of machine window
* @param excludedNodes nodes to avoid
* @param blocksize size of a block
* @param maxReplicasPerRack maximum number of replicas within the same rack
* @param results list of results
* @return
*/
protected boolean chooseMachine(String rack, int firstMachine, int windowSize,
HashMap<Node, Node> excludedNodes, long blocksize,
int maxReplicasPerRack, List<DatanodeDescriptor> results) {
readLock();
try {
HashSet<Integer> excludedMachines = new HashSet<Integer>();
RackRingInfo rackInfo = racksMap.get(rack);
assert (rackInfo != null);
int n = rackInfo.rackNodesMap.size();
List<Node> rackDatanodes = clusterMap.getDatanodesInRack(rack);
if (rackDatanodes == null) {
return false;
}
while (excludedMachines.size() < windowSize) {
int newMachine = randomIntInWindow(firstMachine, windowSize, n,
excludedMachines);
if (newMachine < 0)
return false;
excludedMachines.add(newMachine);
DatanodeDescriptor chosenNode = null;
for (Node node : rackDatanodes) {
DatanodeDescriptor datanode = (DatanodeDescriptor) node;
Integer idx = rackInfo.findNode(datanode);
if (idx != null && idx.intValue() == newMachine) {
chosenNode = datanode;
break;
}
}
if (chosenNode == null)
continue;
Node oldNode = excludedNodes.put(chosenNode, chosenNode);
if (oldNode == null) { // choosendNode was not in the excluded list
if (isGoodTarget(chosenNode, blocksize, maxReplicasPerRack, results)) {
results.add(chosenNode);
return true;
}
}
}
return false;
} finally {
readUnlock();
}
}
  /**
   * {@inheritDoc}
   *
   * Prefers deleting a replica whose removal keeps the best ring-compliant
   * arrangement (and coverage of at least two racks) intact; otherwise
   * defers to the default policy's choice.
   */
  public DatanodeDescriptor chooseReplicaToDelete(FSInodeInfo inode,
      Block block, short replicationFactor,
      Collection<DatanodeDescriptor> first,
      Collection<DatanodeDescriptor> second) {
    List<DatanodeDescriptor> nodes = new ArrayList<DatanodeDescriptor>();
    if (first != null) {
      nodes.addAll(first);
    }
    if (second != null) {
      nodes.addAll(second);
    }
    DatanodeDescriptor[] best = findBest(nodes);
    boolean saved_two_racks = false;
    if (best[0] != null && best[1] != null) {
      saved_two_racks = true;
    }
    for (DatanodeDescriptor n : nodes) {
      // Once two racks are known to stay covered, any node outside the
      // best triple is a safe deletion candidate.
      if (saved_two_racks &&
          !n.equals(best[0]) && !n.equals(best[1]) && !n.equals(best[2])) {
        return n;
      }
      if (!saved_two_racks &&
          (
           (best[0] != null && // different from best[0]'s rack
            !best[0].getNetworkLocation().equals(n.getNetworkLocation()))
           ||
           (best[1] != null && // different from best[1]'s rack
            !best[1].getNetworkLocation().equals(n.getNetworkLocation()))
          ) ) {
        saved_two_racks = true; // just skipped (saved) one machine
                                // in a different rack
      }
    }
    // No safe candidate found here; fall back to the default policy.
    return super.chooseReplicaToDelete(inode, block, replicationFactor,
        first, second);
  }
}