TestBlocksWithNotEnoughRacks.java example

Explorer
hdfs-cloudera-cdh3u3-production-master
- src
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hdfs.server.namenode;

import java.util.ArrayList;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DFSTestUtil;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.server.datanode.DataNode;
import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.commons.logging.impl.Log4JLogger;
import org.apache.log4j.Level;

import org.junit.Ignore;
import org.junit.Test;
import static org.junit.Assert.*;

/**
 * Tests that rack policy is properly maintained during various operations.
 * 
 * NOTE: many of the test cases here are Ignored in CDH but not in trunk.
 * The reasoning is that trunk has HDFS-15, which causes rack policy to
 * get checked on every replication decision. This is a potential performance
 * issue that has not seen enough scale testing in trunk yet, so it
 * has not been backported. Thus, in any case where replication has already
 * reached the desired number of replicas, new replicas will not be made
 * just to satisfy the rack policy. If HDFS-15 is backported, these tests
 * may be re-enabled.
 */
public class TestBlocksWithNotEnoughRacks {
  public static final Log LOG = LogFactory.getLog(TestBlocksWithNotEnoughRacks.class);

  // Raise the namenode state-change log and the namesystem log to Level.ALL
  // for every test in this class.
  static {
    Log4JLogger stateChangeLogger = (Log4JLogger) NameNode.stateChangeLog;
    stateChangeLogger.getLogger().setLevel(Level.ALL);
    Log4JLogger namesystemLogger = (Log4JLogger) FSNamesystem.LOG;
    namesystemLogger.getLogger().setLevel(Level.ALL);
  }

  /*
   * Poll the namesystem, sleeping one second between checks and giving up
   * after 10 checks, until the given block is on exactly the requested
   * number of racks, has exactly the requested number of live replicas,
   * and the namesystem's needed-replications queue has exactly the
   * requested size. The three values are then asserted to match.
   *
   * Polling continues while ANY value differs from its target. The
   * assertions below demand exact equality, so stopping as soon as the
   * values are merely "at least"/"at most" the target would turn a
   * transient overshoot (e.g. the queue not yet populated, or surplus
   * live replicas not yet removed) into an immediate failure.
   *
   * @param ns             namesystem to query
   * @param b              block whose placement is checked
   * @param racks          expected value of ns.getNumberOfRacks(b)
   * @param replicas       expected number of live replicas of b
   * @param neededReplicas expected value of ns.neededReplications.size()
   * @throws Exception if the sleep is interrupted or a query fails
   */
  private void waitForReplication(FSNamesystem ns, Block b, int racks, 
      int replicas, int neededReplicas) throws Exception {
    final int maxChecks = 10;
    int curRacks = ns.getNumberOfRacks(b);
    int curReplicas = ns.countNodes(b).liveReplicas();
    int curNeededReplicas = ns.neededReplications.size();
    int count = 0;

    while (curRacks != racks || curReplicas != replicas || 
           curNeededReplicas != neededReplicas) {
      // Out of attempts: fall through and let the assertions report
      // which of the three values never converged.
      if (++count == maxChecks) { 
        break;
      }
      Thread.sleep(1000);
      curRacks = ns.getNumberOfRacks(b);
      curReplicas = ns.countNodes(b).liveReplicas();
      curNeededReplicas = ns.neededReplications.size();
    }
    assertEquals(racks, curRacks);
    assertEquals(replicas, curReplicas);
    assertEquals(neededReplicas, curNeededReplicas);
  }

  /*
   * Wait for the given DN (host:port) to be decommissioned, checking the
   * datanode report once per second.
   *
   * The wait is bounded: if the datanode has not been reported as
   * decommissioned after maxAttempts checks the test fails instead of
   * hanging forever (e.g. when the name never matches any reported node).
   *
   * @param fs   file system of the cluster; must be a DistributedFileSystem
   * @param name datanode name to look for, compared against
   *             DatanodeInfo.getName()
   * @throws Exception if the sleep is interrupted or the datanode report
   *                   cannot be fetched
   */
  private void waitForDecommission(FileSystem fs, String name) 
      throws Exception {
    // Generous bound (two minutes). NOTE(review): how quickly the namenode
    // advances decommissioning is not visible from this file, so tune this
    // if it proves too tight or too loose.
    final int maxAttempts = 120;
    final DistributedFileSystem dfs = (DistributedFileSystem)fs;
    DatanodeInfo dn = null;

    for (int attempt = 0; attempt < maxAttempts; attempt++) {
      Thread.sleep(1000);
      for (DatanodeInfo info : dfs.getDataNodeStats()) {
        if (name.equals(info.getName())) {
          dn = info;
        }
      }
      if (dn != null && !dn.isDecommissionInProgress() && dn.isDecommissioned()) {
        return;
      }
    }
    fail("Timed out after " + maxAttempts + "s waiting for datanode " 
        + name + " to be decommissioned");
  }

  /*
   * Build the configuration shared by every test in this class: short
   * heartbeat, replication and block report settings so the cluster
   * reacts quickly, plus a topology script name (which enables rack
   * awareness).
   */
  private Configuration getConf() {
    final Configuration testConf = new Configuration();

    // Rack awareness
    testConf.set("topology.script.file.name", "xyz");

    // Low timeouts for testing
    testConf.setLong("dfs.heartbeat.interval", 1L);
    testConf.setLong("dfs.blockreport.intervalMsec", 1000L);
    testConf.setInt("dfs.replication.interval", 1);
    testConf.setInt("dfs.replication.pending.timeout.sec", 1);

    return testConf;
  }

  /*
   * Write the given string, followed by a newline, to the given file.
   * An existing file (or directory) at that path is deleted first.
   *
   * @param fs file system to write to
   * @param p  path of the file to (re)create
   * @param s  content to write before the trailing newline
   * @throws Exception if the file cannot be deleted, created or written
   */
  private void writeFile(FileSystem fs, Path p, String s) throws Exception {
    if (fs.exists(p)) {
      fs.delete(p, true);
    }
    FSDataOutputStream stm = fs.create(p);
    try {
      stm.writeBytes(s);
      stm.writeBytes("\n");
    } finally {
      // Always release the stream, even when a write fails.
      stm.close();
    }
  }

  /*
   * Starts a cluster whose three datanodes all sit on /rack1 and creates
   * a file with replication factor 3, so the block is sufficiently
   * replicated but confined to one rack. A datanode on /rack2 is then
   * added; the block should be replicated to the new rack.
   */
  @Test
  @Ignore("See javadoc at top of class")
  public void testSufficientlyReplBlocksUsesNewRack() throws Exception {
    final Configuration conf = getConf();
    final short replFactor = 3;
    final Path testPath = new Path("/testFile");
    // Every initial datanode shares a single rack
    final String[] initialRacks = {"/rack1", "/rack1", "/rack1"};
    final MiniDFSCluster cluster =
        new MiniDFSCluster(conf, initialRacks.length, true, initialRacks);
    final FSNamesystem ns = cluster.getNameNode().getNamesystem();

    try {
      // File with a replication factor of 3
      final FileSystem fs = cluster.getFileSystem();
      DFSTestUtil.createFile(fs, testPath, 1L, replFactor, 1L);
      final Block block = DFSTestUtil.getFirstBlock(fs, testPath);
      waitForReplication(ns, block, 1, replFactor, 1);

      // Bring up one more datanode, this time on a second rack
      final String[] extraRacks = {"/rack2"};
      cluster.startDataNodes(conf, 1, true, null, extraRacks);
      cluster.waitActive();

      waitForReplication(ns, block, 2, replFactor, 0);
    } finally {
      cluster.shutdown();
    }
  }

  /*
   * Starts with five datanodes on /rack1 and a file at replication
   * factor 3. Two datanodes on /rack2 are then added and the replication
   * factor is raised to 5, after which the block must have enough
   * replicas spread across both racks.
   *
   * If testSufficientlyReplBlocksUsesNewRack passes this one should too.
   * The reverse does not hold: here the replication code is explicitly
   * triggered by setting the replication factor, so this test may pass
   * even when that one fails.
   */
  @Test
  @Ignore("See javadoc at top of class")
  public void testUnderReplicatedUsesNewRacks() throws Exception {
    final Configuration conf = getConf();
    final short initialRepl = 3;
    final short raisedRepl = 5;
    final Path testPath = new Path("/testFile");
    // Every initial datanode shares a single rack
    final String[] initialRacks =
        {"/rack1", "/rack1", "/rack1", "/rack1", "/rack1"};
    final MiniDFSCluster cluster =
        new MiniDFSCluster(conf, initialRacks.length, true, initialRacks);
    final FSNamesystem ns = cluster.getNameNode().getNamesystem();

    try {
      final FileSystem fs = cluster.getFileSystem();
      DFSTestUtil.createFile(fs, testPath, 1L, initialRepl, 1L);
      final Block block = DFSTestUtil.getFirstBlock(fs, testPath);
      waitForReplication(ns, block, 1, initialRepl, 1);

      // Two more datanodes on a second rack, then a higher replication
      // factor: the block becomes under-replicated and at least one of
      // the new-rack hosts has to be used to satisfy it.
      final String[] extraRacks = {"/rack2", "/rack2"};
      cluster.startDataNodes(conf, 2, true, null, extraRacks);
      ns.setReplication("/testFile", raisedRepl);

      waitForReplication(ns, block, 2, raisedRepl, 0);
    } finally {
      cluster.shutdown();
    }
  }

  /*
   * Marks a block as corrupt on one datanode and checks that, once it
   * has been re-replicated, the block is still spread across racks.
   */
  @Test
  @Ignore("See class JavaDoc")
  public void testCorruptBlockRereplicatedAcrossRacks() throws Exception {
    final Configuration conf = getConf();
    final short replFactor = 2;
    final Path testPath = new Path("/testFile");
    // Only the third datanode is on the second rack
    final String[] rackLayout = {"/rack1", "/rack1", "/rack2"};
    final MiniDFSCluster cluster =
        new MiniDFSCluster(conf, rackLayout.length, true, rackLayout);
    final FSNamesystem ns = cluster.getNameNode().getNamesystem();

    try {
      // File with a replication factor of 2
      final FileSystem fs = cluster.getFileSystem();
      DFSTestUtil.createFile(fs, testPath, 1L, replFactor, 1L);
      final Block block = DFSTestUtil.getFirstBlock(fs, testPath);
      waitForReplication(ns, block, 2, replFactor, 0);

      // Report the block as corrupt on behalf of the 3rd datanode
      final ArrayList<DataNode> liveNodes = cluster.getDataNodes();
      assertEquals(3, liveNodes.size());
      final DataNode thirdNode = liveNodes.get(2);
      final DatanodeInfo reporter = new DatanodeInfo(thirdNode.dnRegistration);
      cluster.getNameNode().namesystem.markBlockAsCorrupt(block, reporter);

      // Rack policy must still hold afterwards (i.e. the 3rd datanode
      // received the new replica even though it reported the corruption).
      waitForReplication(ns, block, 2, replFactor, 0);
    } finally {
      cluster.shutdown();
    }
  }

  /*
   * Lowers the replication factor of a file from 3 to 2 and checks that
   * the replica dropped is not the only one living on its rack, i.e. the
   * block still spans two racks afterwards.
   */
  @Test
  public void testReduceReplFactorRespectsRackPolicy() throws Exception {
    final Configuration conf = getConf();
    final short initialRepl = 3;
    final short loweredRepl = 2;
    final Path testPath = new Path("/testFile");
    final String[] rackLayout = {"/rack1", "/rack1", "/rack2", "/rack2"};
    final MiniDFSCluster cluster =
        new MiniDFSCluster(conf, rackLayout.length, true, rackLayout);
    final FSNamesystem ns = cluster.getNameNode().getNamesystem();

    try {
      final FileSystem fs = cluster.getFileSystem();
      DFSTestUtil.createFile(fs, testPath, 1L, initialRepl, 1L);
      final Block block = DFSTestUtil.getFirstBlock(fs, testPath);
      waitForReplication(ns, block, 2, initialRepl, 0);

      // With three replicas over two racks, one rack holds a single
      // replica. Dropping to two replicas must not remove that one.
      ns.setReplication("/testFile", loweredRepl);

      waitForReplication(ns, block, 2, loweredRepl, 0);
    } finally {
      cluster.shutdown();
    }
  }

  /*
   * Test that when a block is re-replicated because a replica was lost
   * to a host failure, the rack policy is preserved.
   */
  @Test
  public void testReplDueToNodeFailRespectsRackPolicy() throws Exception {
    final Configuration conf = getConf();
    final short replFactor = 3;
    final Path testPath = new Path("/testFile");
    // The last two datanodes are on a different rack
    final String[] rackLayout =
        {"/rack1", "/rack1", "/rack1", "/rack2", "/rack2"};
    final MiniDFSCluster cluster =
        new MiniDFSCluster(conf, rackLayout.length, true, rackLayout);
    final FSNamesystem ns = cluster.getNameNode().getNamesystem();

    try {
      // File with a replication factor of 3
      final FileSystem fs = cluster.getFileSystem();
      DFSTestUtil.createFile(fs, testPath, 1L, replFactor, 1L);
      final Block block = DFSTestUtil.getFirstBlock(fs, testPath);
      waitForReplication(ns, block, 2, replFactor, 0);

      // Simulate a missed heartbeat on the last datanode: stop it and
      // remove it from the namesystem.
      ArrayList<DataNode> liveNodes = cluster.getDataNodes();
      int lastIdx = liveNodes.size() - 1;
      DataNode victim = liveNodes.get(lastIdx);
      cluster.stopDataNode(lastIdx);
      ns.removeDatanode(victim.dnRegistration);

      // Replica count and rack spread must both still be satisfied. The
      // stopped node may not have held a replica; if it did, the replica
      // should have been re-created within the same rack.
      waitForReplication(ns, block, 2, replFactor, 0);

      // Fail the (new) last datanode as well. It is the other /rack2
      // host, so only one rack remains for all replicas.
      liveNodes = cluster.getDataNodes();
      lastIdx = liveNodes.size() - 1;
      victim = liveNodes.get(lastIdx);
      cluster.stopDataNode(lastIdx);
      ns.removeDatanode(victim.dnRegistration);

      // There must still be enough live replicas although one rack is
      // gone.
      /**
       * In trunk, the assertion here is the following:
       * waitForReplication(ns, b, 1, REPLICATION_FACTOR, 1);
       *
       * i.e. the block is confined to 1 rack and wants one more replica,
       * since it isn't achieving the right rack policy.
       * In 0.20, if there is only one live rack, then this block
       * is not considered under-replicated, since rack policy is
       * unachievable (see class JavaDoc above).
       */
      waitForReplication(ns, block, 1, replFactor, 0);
    } finally {
      cluster.shutdown();
    }
  }
  
  /*
   * Test that when the excess replicas of a block are reduced because a
   * node re-joined the cluster, the rack policy is not violated.
   */
  @Test
  @Ignore("See javadoc at top of class")
  public void testReduceReplFactorDueToRejoinRespectsRackPolicy() throws Exception {
    final Configuration conf = getConf();
    final short replFactor = 2;
    final Path testPath = new Path("/testFile");
    // Only the third datanode is on the second rack
    final String[] rackLayout = {"/rack1", "/rack1", "/rack2"};
    final MiniDFSCluster cluster =
        new MiniDFSCluster(conf, rackLayout.length, true, rackLayout);
    final FSNamesystem ns = cluster.getNameNode().getNamesystem();

    try {
      final FileSystem fs = cluster.getFileSystem();
      DFSTestUtil.createFile(fs, testPath, 1L, replFactor, 1L);
      final Block block = DFSTestUtil.getFirstBlock(fs, testPath);
      waitForReplication(ns, block, 2, replFactor, 0);

      // Simulate a missed heartbeat on the cross-rack datanode: stop it
      // and remove it from the namesystem.
      final ArrayList<DataNode> liveNodes = cluster.getDataNodes();
      assertEquals(3, liveNodes.size());
      final DataNode crossRackNode = liveNodes.get(2);
      cluster.stopDataNode(2);
      ns.removeDatanode(crossRackNode.dnRegistration);

      // Re-replication onto a remaining host restores the replica count
      // but not the rack spread: expect 1 rack and 1 needed replica (even
      // though 2 hosts are available and only 2 replicas are required).
      waitForReplication(ns, block, 1, replFactor, 1);

      // Bring the "failed" datanode back. It has a replica, so the block
      // is over-replicated and one replica has to go -- but not the one
      // on the restarted datanode, as that would violate the rack policy.
      final String[] rejoinRack = {"/rack2"};
      cluster.startDataNodes(conf, 1, true, null, rejoinRack);
      cluster.waitActive();

      // Enough replicas again, and across racks
      waitForReplication(ns, block, 2, replFactor, 0);
    } finally {
      cluster.shutdown();
    }
  }

  /*
   * Test that the rack policy is still respected when blocks are
   * re-replicated because a node is being decommissioned.
   */
  @Test
  public void testNodeDecomissionRespectsRackPolicy() throws Exception {
    final Configuration conf = getConf();
    final short replFactor = 2;
    final Path testPath = new Path("/testFile");

    // Start from an empty excludes file and point the namenode at it
    final FileSystem localFs = FileSystem.getLocal(conf);
    final Path excludeDir = new Path(localFs.getWorkingDirectory(),
        "build/test/data/temp/decommission");
    final Path excludes = new Path(excludeDir, "exclude");
    assertTrue(localFs.mkdirs(excludeDir));
    writeFile(localFs, excludes, "");
    conf.set("dfs.hosts.exclude", excludes.toUri().getPath());

    // Four datanodes, two on each of two racks
    final String[] rackLayout = {"/rack1", "/rack1", "/rack2", "/rack2"};
    final MiniDFSCluster cluster =
        new MiniDFSCluster(conf, rackLayout.length, true, rackLayout);
    final FSNamesystem ns = cluster.getNameNode().getNamesystem();

    try {
      final FileSystem fs = cluster.getFileSystem();
      DFSTestUtil.createFile(fs, testPath, 1L, replFactor, 1L);
      final Block block = DFSTestUtil.getFirstBlock(fs, testPath);
      waitForReplication(ns, block, 2, replFactor, 0);

      // Decommission one host that holds the block. The replacement
      // replica has to land on another host of the same rack, otherwise
      // the rack policy is violated.
      final BlockLocation[] locations = fs.getFileBlockLocations(
          fs.getFileStatus(testPath), 0, Long.MAX_VALUE);
      final String hostToDecommission = locations[0].getNames()[0];
      writeFile(localFs, excludes, hostToDecommission);
      ns.refreshNodes(conf);
      waitForDecommission(fs, hostToDecommission);

      // Still enough replicas, still across racks
      waitForReplication(ns, block, 2, replFactor, 0);
    } finally {
      cluster.shutdown();
    }
  }

  /*
   * Test that the rack policy is still respected when blocks are
   * re-replicated because a node is being decommissioned while the
   * blocks are over-replicated.
   */
  @Test
  public void testNodeDecomissionWithOverreplicationRespectsRackPolicy() 
      throws Exception {
    final Configuration conf = getConf();
    final short initialRepl = 5;
    final short loweredRepl = 2;
    final Path testPath = new Path("/testFile");

    // Start from an empty excludes file and point the namenode at it
    final FileSystem localFs = FileSystem.getLocal(conf);
    final Path excludeDir = new Path(localFs.getWorkingDirectory(),
        "build/test/data/temp/decommission");
    final Path excludes = new Path(excludeDir, "exclude");
    assertTrue(localFs.mkdirs(excludeDir));
    writeFile(localFs, excludes, "");
    conf.set("dfs.hosts.exclude", excludes.toUri().getPath());

    // Two racks in total, with a single host on /rack2
    final String[] rackLayout =
        {"/rack1", "/rack2", "/rack1", "/rack1", "/rack1"};
    final MiniDFSCluster cluster =
        new MiniDFSCluster(conf, rackLayout.length, true, rackLayout);
    final FSNamesystem ns = cluster.getNameNode().getNamesystem();

    try {
      final FileSystem fs = cluster.getFileSystem();
      DFSTestUtil.createFile(fs, testPath, 1L, initialRepl, 1L);
      final Block block = DFSTestUtil.getFirstBlock(fs, testPath);
      waitForReplication(ns, block, 2, initialRepl, 0);

      // Drop the replication factor so the block is over-replicated
      fs.setReplication(testPath, loweredRepl);

      // Pick the first replica holder that is NOT the lone /rack2 host
      // (decommissioning that one would make the rack policy impossible
      // to satisfy). The host name is what follows the rack prefix and
      // the separator in the topology path.
      final BlockLocation[] locations = fs.getFileBlockLocations(
          fs.getFileStatus(testPath), 0, Long.MAX_VALUE);
      String hostToDecommission = null;
      for (String topologyPath : locations[0].getTopologyPaths()) {
        if (!topologyPath.startsWith("/rack2")) {
          hostToDecommission = topologyPath.substring("/rack1".length()+1);
          break;
        }
      }
      if (hostToDecommission != null) {
        writeFile(localFs, excludes, hostToDecommission);
        ns.refreshNodes(conf);
        waitForDecommission(fs, hostToDecommission);
      }

      // Still enough replicas across racks, i.e. the replica on the
      // lone /rack2 host was not the one removed.
      waitForReplication(ns, block, 2, loweredRepl, 0);
    } finally {
      cluster.shutdown();
    }
  }
}