/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hdfs; import java.io.IOException; import java.util.List; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.permission.FsPermission; import org.apache.hadoop.hdfs.MiniDFSCluster.DataNodeProperties; import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction; import org.apache.hadoop.hdfs.server.blockmanagement.BlockManagerTestUtil; import org.apache.hadoop.hdfs.server.namenode.FSNamesystem; import org.apache.hadoop.io.IOUtils; import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption; import org.apache.hadoop.hdfs.server.namenode.NameNode; import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter; import org.apache.hadoop.test.GenericTestUtils; import static org.junit.Assert.*; import org.junit.Before; import org.junit.After; import org.junit.Test; import com.google.common.base.Supplier; import com.google.common.collect.Lists; /** * Tests to verify safe mode correctness. */ public class TestSafeMode { private static final Path TEST_PATH = new Path("/test"); private static final int BLOCK_SIZE = 1024; Configuration conf; MiniDFSCluster cluster; FileSystem fs; DistributedFileSystem dfs; @Before public void startUp() throws IOException { conf = new HdfsConfiguration(); conf.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, BLOCK_SIZE); cluster = new MiniDFSCluster.Builder(conf).numDataNodes(1).build(); cluster.waitActive(); fs = cluster.getFileSystem(); dfs = (DistributedFileSystem)fs; } @After public void tearDown() throws IOException { if (fs != null) { fs.close(); } if (cluster != null) { cluster.shutdown(); } } /** * This test verifies that if SafeMode is manually entered, name-node does not * come out of safe mode even after the startup safe mode conditions are met. * <ol> * <li>Start cluster with 1 data-node.</li> * <li>Create 2 files with replication 1.</li> * <li>Re-start cluster with 0 data-nodes. * Name-node should stay in automatic safe-mode.</li> * <li>Enter safe mode manually.</li> * <li>Start the data-node.</li> * <li>Wait longer than <tt>dfs.namenode.safemode.extension</tt> and * verify that the name-node is still in safe mode.</li> * </ol> * * @throws IOException */ @Test public void testManualSafeMode() throws IOException { fs = (DistributedFileSystem)cluster.getFileSystem(); Path file1 = new Path("/tmp/testManualSafeMode/file1"); Path file2 = new Path("/tmp/testManualSafeMode/file2"); // create two files with one block each. DFSTestUtil.createFile(fs, file1, 1000, (short)1, 0); DFSTestUtil.createFile(fs, file2, 1000, (short)1, 0); fs.close(); cluster.shutdown(); // now bring up just the NameNode. cluster = new MiniDFSCluster.Builder(conf).numDataNodes(0).format(false).build(); cluster.waitActive(); dfs = (DistributedFileSystem)cluster.getFileSystem(); assertTrue("No datanode is started. Should be in SafeMode", dfs.setSafeMode(SafeModeAction.SAFEMODE_GET)); // manually set safemode. dfs.setSafeMode(SafeModeAction.SAFEMODE_ENTER); // now bring up the datanode and wait for it to be active. cluster.startDataNodes(conf, 1, true, null, null); cluster.waitActive(); // wait longer than dfs.namenode.safemode.extension try { Thread.sleep(2000); } catch (InterruptedException ignored) {} assertTrue("should still be in SafeMode", dfs.setSafeMode(SafeModeAction.SAFEMODE_GET)); assertFalse("should not be in SafeMode", dfs.setSafeMode(SafeModeAction.SAFEMODE_LEAVE)); } /** * Test that, if there are no blocks in the filesystem, * the NameNode doesn't enter the "safemode extension" period. */ @Test(timeout=45000) public void testNoExtensionIfNoBlocks() throws IOException { cluster.getConfiguration(0).setInt( DFSConfigKeys.DFS_NAMENODE_SAFEMODE_EXTENSION_KEY, 60000); cluster.restartNameNode(); // Even though we have safemode extension set high, we should immediately // exit safemode on startup because there are no blocks in the namespace. String status = cluster.getNameNode().getNamesystem().getSafemode(); assertEquals("", status); } /** * Test that the NN initializes its under-replicated blocks queue * before it is ready to exit safemode (HDFS-1476) */ @Test(timeout=45000) public void testInitializeReplQueuesEarly() throws Exception { // Spray the blocks around the cluster when we add DNs instead of // concentrating all blocks on the first node. BlockManagerTestUtil.setWritingPrefersLocalNode( cluster.getNamesystem().getBlockManager(), false); cluster.startDataNodes(conf, 2, true, StartupOption.REGULAR, null); cluster.waitActive(); DFSTestUtil.createFile(fs, TEST_PATH, 15*BLOCK_SIZE, (short)1, 1L); List<DataNodeProperties> dnprops = Lists.newLinkedList(); dnprops.add(cluster.stopDataNode(0)); dnprops.add(cluster.stopDataNode(0)); dnprops.add(cluster.stopDataNode(0)); cluster.getConfiguration(0).setFloat( DFSConfigKeys.DFS_NAMENODE_REPL_QUEUE_THRESHOLD_PCT_KEY, 1f/15f); cluster.restartNameNode(); final NameNode nn = cluster.getNameNode(); String status = nn.getNamesystem().getSafemode(); assertEquals("Safe mode is ON.The reported blocks 0 needs additional " + "15 blocks to reach the threshold 0.9990 of total blocks 15. " + "Safe mode will be turned off automatically.", status); assertFalse("Mis-replicated block queues should not be initialized " + "until threshold is crossed", NameNodeAdapter.safeModeInitializedReplQueues(nn)); cluster.restartDataNode(dnprops.remove(0)); // Wait for the block report from the restarted DN to come in. GenericTestUtils.waitFor(new Supplier<Boolean>() { @Override public Boolean get() { return NameNodeAdapter.getSafeModeSafeBlocks(nn) > 0; } }, 10, 10000); // SafeMode is fine-grain synchronized, so the processMisReplicatedBlocks // call is still going on at this point - wait until it's done by grabbing // the lock. nn.getNamesystem().writeLock(); nn.getNamesystem().writeUnlock(); int safe = NameNodeAdapter.getSafeModeSafeBlocks(nn); assertTrue("Expected first block report to make some but not all blocks " + "safe. Got: " + safe, safe >= 1 && safe < 15); BlockManagerTestUtil.updateState(nn.getNamesystem().getBlockManager()); assertTrue(NameNodeAdapter.safeModeInitializedReplQueues(nn)); assertEquals(15 - safe, nn.getNamesystem().getUnderReplicatedBlocks()); cluster.restartDataNodes(); } /** * Test that, when under-replicated blocks are processed at the end of * safe-mode, blocks currently under construction are not considered * under-construction or missing. Regression test for HDFS-2822. */ @Test public void testRbwBlocksNotConsideredUnderReplicated() throws IOException { List<FSDataOutputStream> stms = Lists.newArrayList(); try { // Create some junk blocks so that the NN doesn't just immediately // exit safemode on restart. DFSTestUtil.createFile(fs, new Path("/junk-blocks"), BLOCK_SIZE*4, (short)1, 1L); // Create several files which are left open. It's important to // create several here, because otherwise the first iteration of the // replication monitor will pull them off the replication queue and // hide this bug from the test! for (int i = 0; i < 10; i++) { FSDataOutputStream stm = fs.create( new Path("/append-" + i), true, BLOCK_SIZE, (short) 1, BLOCK_SIZE); stms.add(stm); stm.write(1); stm.hflush(); } cluster.restartNameNode(); FSNamesystem ns = cluster.getNameNode(0).getNamesystem(); BlockManagerTestUtil.updateState(ns.getBlockManager()); assertEquals(0, ns.getPendingReplicationBlocks()); assertEquals(0, ns.getCorruptReplicaBlocks()); assertEquals(0, ns.getMissingBlocksCount()); } finally { for (FSDataOutputStream stm : stms) { IOUtils.closeStream(stm); } cluster.shutdown(); } } public interface FSRun { public abstract void run(FileSystem fs) throws IOException; } /** * Assert that the given function fails to run due to a safe * mode exception. */ public void runFsFun(String msg, FSRun f) { try { f.run(fs); fail(msg); } catch (IOException ioe) { assertTrue(ioe.getMessage().contains("safe mode")); } } /** * Run various fs operations while the NN is in safe mode, * assert that they are either allowed or fail as expected. */ @Test public void testOperationsWhileInSafeMode() throws IOException { final Path file1 = new Path("/file1"); assertFalse(dfs.setSafeMode(SafeModeAction.SAFEMODE_GET)); DFSTestUtil.createFile(fs, file1, 1024, (short)1, 0); assertTrue("Could not enter SM", dfs.setSafeMode(SafeModeAction.SAFEMODE_ENTER)); runFsFun("Set quota while in SM", new FSRun() { public void run(FileSystem fs) throws IOException { ((DistributedFileSystem)fs).setQuota(file1, 1, 1); }}); runFsFun("Set perm while in SM", new FSRun() { public void run(FileSystem fs) throws IOException { fs.setPermission(file1, FsPermission.getDefault()); }}); runFsFun("Set owner while in SM", new FSRun() { public void run(FileSystem fs) throws IOException { fs.setOwner(file1, "user", "group"); }}); runFsFun("Set repl while in SM", new FSRun() { public void run(FileSystem fs) throws IOException { fs.setReplication(file1, (short)1); }}); runFsFun("Append file while in SM", new FSRun() { public void run(FileSystem fs) throws IOException { DFSTestUtil.appendFile(fs, file1, "new bytes"); }}); runFsFun("Delete file while in SM", new FSRun() { public void run(FileSystem fs) throws IOException { fs.delete(file1, false); }}); runFsFun("Rename file while in SM", new FSRun() { public void run(FileSystem fs) throws IOException { fs.rename(file1, new Path("file2")); }}); try { fs.setTimes(file1, 0, 0); } catch (IOException ioe) { fail("Set times failed while in SM"); } try { DFSTestUtil.readFile(fs, file1); } catch (IOException ioe) { fail("Set times failed while in SM"); } assertFalse("Could not leave SM", dfs.setSafeMode(SafeModeAction.SAFEMODE_LEAVE)); } /** * Verify that the NameNode stays in safemode when dfs.safemode.datanode.min * is set to a number greater than the number of live datanodes. */ @Test public void testDatanodeThreshold() throws IOException { cluster.shutdown(); Configuration conf = cluster.getConfiguration(0); conf.setInt(DFSConfigKeys.DFS_NAMENODE_SAFEMODE_EXTENSION_KEY, 0); conf.setInt(DFSConfigKeys.DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY, 1); cluster.restartNameNode(); fs = (DistributedFileSystem)cluster.getFileSystem(); String tipMsg = cluster.getNamesystem().getSafemode(); assertTrue("Safemode tip message looks right: " + tipMsg, tipMsg.contains("The number of live datanodes 0 needs an additional " + "1 live datanodes to reach the minimum number 1. " + "Safe mode will be turned off automatically.")); // Start a datanode cluster.startDataNodes(conf, 1, true, null, null); // Wait long enough for safemode check to refire try { Thread.sleep(1000); } catch (InterruptedException ignored) {} // We now should be out of safe mode. assertEquals("", cluster.getNamesystem().getSafemode()); } }