/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hdfs.server.namenode.ha; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; import java.io.IOException; import java.net.InetSocketAddress; import java.net.URI; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.impl.Log4JLogger; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.ha.HAServiceProtocol.RequestSource; import org.apache.hadoop.ha.HAServiceProtocol.StateChangeRequestInfo; import org.apache.hadoop.hdfs.DFSConfigKeys; import org.apache.hadoop.hdfs.DFSTestUtil; import org.apache.hadoop.hdfs.DistributedFileSystem; import org.apache.hadoop.hdfs.MiniDFSCluster; import org.apache.hadoop.hdfs.MiniDFSNNTopology; import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction; import org.apache.hadoop.hdfs.server.blockmanagement.BlockManagerTestUtil; import org.apache.hadoop.hdfs.server.namenode.FSImage; import org.apache.hadoop.hdfs.server.namenode.FSNamesystem; import org.apache.hadoop.hdfs.server.namenode.NameNode; import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter; import org.apache.hadoop.io.IOUtils; import org.apache.hadoop.ipc.RemoteException; import org.apache.hadoop.ipc.StandbyException; import org.apache.hadoop.test.GenericTestUtils; import org.apache.log4j.Level; import org.junit.After; import org.junit.Before; import org.junit.Test; import com.google.common.base.Supplier; import com.google.common.collect.Lists; /** * Tests that exercise safemode in an HA cluster. */ public class TestHASafeMode { private static final Log LOG = LogFactory.getLog(TestHASafeMode.class); private static final int BLOCK_SIZE = 1024; private NameNode nn0; private NameNode nn1; private FileSystem fs; private MiniDFSCluster cluster; static { ((Log4JLogger)LogFactory.getLog(FSImage.class)).getLogger().setLevel(Level.ALL); ((Log4JLogger)LogFactory.getLog(FSNamesystem.class)).getLogger().setLevel(Level.ALL); ((Log4JLogger)NameNode.stateChangeLog).getLogger().setLevel(Level.ALL); } @Before public void setupCluster() throws Exception { Configuration conf = new Configuration(); conf.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, BLOCK_SIZE); conf.setInt(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY, 1); conf.setInt(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, 1); cluster = new MiniDFSCluster.Builder(conf) .nnTopology(MiniDFSNNTopology.simpleHATopology()) .numDataNodes(3) .waitSafeMode(false) .build(); cluster.waitActive(); nn0 = cluster.getNameNode(0); nn1 = cluster.getNameNode(1); fs = HATestUtil.configureFailoverFs(cluster, conf); cluster.transitionToActive(0); } @After public void shutdownCluster() { if (cluster != null) { cluster.shutdown(); } } /** * Make sure the client retries when the active NN is in safemode */ @Test (timeout=300000) public void testClientRetrySafeMode() throws Exception { final Map<Path, Boolean> results = Collections .synchronizedMap(new HashMap<Path, Boolean>()); final Path test = new Path("/test"); // let nn0 enter safemode NameNodeAdapter.enterSafeMode(nn0, false); LOG.info("enter safemode"); new Thread() { @Override public void run() { try { boolean mkdir = fs.mkdirs(test); LOG.info("mkdir finished, result is " + mkdir); synchronized (TestHASafeMode.this) { results.put(test, mkdir); TestHASafeMode.this.notifyAll(); } } catch (Exception e) { LOG.info("Got Exception while calling mkdir", e); } } }.start(); // make sure the client's call has actually been handled by the active NN assertFalse("The directory should not be created while NN in safemode", fs.exists(test)); Thread.sleep(1000); // let nn0 leave safemode NameNodeAdapter.leaveSafeMode(nn0); LOG.info("leave safemode"); synchronized (this) { while (!results.containsKey(test)) { this.wait(); } assertTrue(results.get(test)); } } private void restartStandby() throws IOException { cluster.shutdownNameNode(1); // Set the safemode extension to be lengthy, so that the tests // can check the safemode message after the safemode conditions // have been achieved, without being racy. cluster.getConfiguration(1).setInt( DFSConfigKeys.DFS_NAMENODE_SAFEMODE_EXTENSION_KEY, 30000); cluster.getConfiguration(1).setInt( DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, 1); cluster.restartNameNode(1); nn1 = cluster.getNameNode(1); assertEquals(nn1.getNamesystem().getTransactionsSinceLastLogRoll(), 0L); } /** * Test case for enter safemode in active namenode, when it is already in startup safemode. * It is a regression test for HDFS-2747. */ @Test public void testEnterSafeModeInANNShouldNotThrowNPE() throws Exception { banner("Restarting active"); DFSTestUtil .createFile(fs, new Path("/test"), 3 * BLOCK_SIZE, (short) 3, 1L); restartActive(); nn0.getRpcServer().transitionToActive( new StateChangeRequestInfo(RequestSource.REQUEST_BY_USER)); FSNamesystem namesystem = nn0.getNamesystem(); String status = namesystem.getSafemode(); assertTrue("Bad safemode status: '" + status + "'", status .startsWith("Safe mode is ON.")); NameNodeAdapter.enterSafeMode(nn0, false); assertTrue("Failed to enter into safemode in active", namesystem .isInSafeMode()); NameNodeAdapter.enterSafeMode(nn0, false); assertTrue("Failed to enter into safemode in active", namesystem .isInSafeMode()); } /** * Test case for enter safemode in standby namenode, when it is already in startup safemode. * It is a regression test for HDFS-2747. */ @Test public void testEnterSafeModeInSBNShouldNotThrowNPE() throws Exception { banner("Starting with NN0 active and NN1 standby, creating some blocks"); DFSTestUtil .createFile(fs, new Path("/test"), 3 * BLOCK_SIZE, (short) 3, 1L); // Roll edit log so that, when the SBN restarts, it will load // the namespace during startup and enter safemode. nn0.getRpcServer().rollEditLog(); banner("Creating some blocks that won't be in the edit log"); DFSTestUtil.createFile(fs, new Path("/test2"), 5 * BLOCK_SIZE, (short) 3, 1L); banner("Deleting the original blocks"); fs.delete(new Path("/test"), true); banner("Restarting standby"); restartStandby(); FSNamesystem namesystem = nn1.getNamesystem(); String status = namesystem.getSafemode(); assertTrue("Bad safemode status: '" + status + "'", status .startsWith("Safe mode is ON.")); NameNodeAdapter.enterSafeMode(nn1, false); assertTrue("Failed to enter into safemode in standby", namesystem .isInSafeMode()); NameNodeAdapter.enterSafeMode(nn1, false); assertTrue("Failed to enter into safemode in standby", namesystem .isInSafeMode()); } private void restartActive() throws IOException { cluster.shutdownNameNode(0); // Set the safemode extension to be lengthy, so that the tests // can check the safemode message after the safemode conditions // have been achieved, without being racy. cluster.getConfiguration(0).setInt( DFSConfigKeys.DFS_NAMENODE_SAFEMODE_EXTENSION_KEY, 30000); cluster.restartNameNode(0); nn0 = cluster.getNameNode(0); } /** * Tests the case where, while a standby is down, more blocks are * added to the namespace, but not rolled. So, when it starts up, * it receives notification about the new blocks during * the safemode extension period. */ @Test public void testBlocksAddedBeforeStandbyRestart() throws Exception { banner("Starting with NN0 active and NN1 standby, creating some blocks"); DFSTestUtil.createFile(fs, new Path("/test"), 3*BLOCK_SIZE, (short) 3, 1L); // Roll edit log so that, when the SBN restarts, it will load // the namespace during startup. nn0.getRpcServer().rollEditLog(); banner("Creating some blocks that won't be in the edit log"); DFSTestUtil.createFile(fs, new Path("/test2"), 5*BLOCK_SIZE, (short) 3, 1L); banner("Restarting standby"); restartStandby(); // We expect it not to be stuck in safemode, since those blocks // that are already visible to the SBN should be processed // in the initial block reports. assertSafeMode(nn1, 3, 3, 3, 0); banner("Waiting for standby to catch up to active namespace"); HATestUtil.waitForStandbyToCatchUp(nn0, nn1); assertSafeMode(nn1, 8, 8, 3, 0); } /** * Similar to {@link #testBlocksAddedBeforeStandbyRestart()} except that * the new blocks are allocated after the SBN has restarted. So, the * blocks were not present in the original block reports at startup * but are reported separately by blockReceived calls. */ @Test public void testBlocksAddedWhileInSafeMode() throws Exception { banner("Starting with NN0 active and NN1 standby, creating some blocks"); DFSTestUtil.createFile(fs, new Path("/test"), 3*BLOCK_SIZE, (short) 3, 1L); // Roll edit log so that, when the SBN restarts, it will load // the namespace during startup. nn0.getRpcServer().rollEditLog(); banner("Restarting standby"); restartStandby(); assertSafeMode(nn1, 3, 3, 3, 0); // Create a few blocks which will send blockReceived calls to the // SBN. banner("Creating some blocks while SBN is in safe mode"); DFSTestUtil.createFile(fs, new Path("/test2"), 5*BLOCK_SIZE, (short) 3, 1L); banner("Waiting for standby to catch up to active namespace"); HATestUtil.waitForStandbyToCatchUp(nn0, nn1); assertSafeMode(nn1, 8, 8, 3, 0); } /** * Test for the following case proposed by ATM: * 1. Both NNs are up, one is active. There are 100 blocks. Both are * out of safemode. * 2. 10 block deletions get processed by NN1. NN2 enqueues these DN messages * until it next reads from a checkpointed edits file. * 3. NN2 gets restarted. Its queues are lost. * 4. NN2 comes up, reads from all the finalized edits files. Concludes there * should still be 100 blocks. * 5. NN2 receives a block report from all the DNs, which only accounts for * 90 blocks. It doesn't leave safemode. * 6. NN1 dies or is transitioned to standby. * 7. NN2 is transitioned to active. It reads all the edits from NN1. It now * knows there should only be 90 blocks, but it's still in safemode. * 8. NN2 doesn't ever recheck whether it should leave safemode. * * This is essentially the inverse of {@link #testBlocksAddedBeforeStandbyRestart()} */ @Test public void testBlocksRemovedBeforeStandbyRestart() throws Exception { banner("Starting with NN0 active and NN1 standby, creating some blocks"); DFSTestUtil.createFile(fs, new Path("/test"), 5*BLOCK_SIZE, (short) 3, 1L); // Roll edit log so that, when the SBN restarts, it will load // the namespace during startup. nn0.getRpcServer().rollEditLog(); // Delete those blocks again, so they won't get reported to the SBN // once it starts up banner("Removing the blocks without rolling the edit log"); fs.delete(new Path("/test"), true); BlockManagerTestUtil.computeAllPendingWork( nn0.getNamesystem().getBlockManager()); cluster.triggerHeartbeats(); banner("Restarting standby"); restartStandby(); assertSafeMode(nn1, 0, 5, 3, 0); banner("Waiting for standby to catch up to active namespace"); HATestUtil.waitForStandbyToCatchUp(nn0, nn1); assertSafeMode(nn1, 0, 0, 3, 0); } /** * Similar to {@link #testBlocksRemovedBeforeStandbyRestart()} except that * the blocks are removed after the SBN has restarted. So, the * blocks were present in the original block reports at startup * but are deleted separately later by deletion reports. */ @Test public void testBlocksRemovedWhileInSafeMode() throws Exception { banner("Starting with NN0 active and NN1 standby, creating some blocks"); DFSTestUtil.createFile(fs, new Path("/test"), 10*BLOCK_SIZE, (short) 3, 1L); // Roll edit log so that, when the SBN restarts, it will load // the namespace during startup. nn0.getRpcServer().rollEditLog(); banner("Restarting standby"); restartStandby(); // It will initially have all of the blocks necessary. assertSafeMode(nn1, 10, 10, 3, 0); // Delete those blocks while the SBN is in safe mode. // This doesn't affect the SBN, since deletions are not // ACKed when due to block removals. banner("Removing the blocks without rolling the edit log"); fs.delete(new Path("/test"), true); BlockManagerTestUtil.computeAllPendingWork( nn0.getNamesystem().getBlockManager()); banner("Triggering deletions on DNs and Deletion Reports"); cluster.triggerHeartbeats(); HATestUtil.waitForDNDeletions(cluster); cluster.triggerDeletionReports(); assertSafeMode(nn1, 10, 10, 3, 0); // When we catch up to active namespace, it will restore back // to 0 blocks. banner("Waiting for standby to catch up to active namespace"); HATestUtil.waitForStandbyToCatchUp(nn0, nn1); assertSafeMode(nn1, 0, 0, 3, 0); } /** * Tests that the standby node properly tracks the number of total * and safe blocks while it is in safe mode. Since safe-mode only * counts completed blocks, append needs to decrement the total * number of blocks and then re-increment when the file is closed * again. */ @Test public void testAppendWhileInSafeMode() throws Exception { banner("Starting with NN0 active and NN1 standby, creating some blocks"); // Make 4.5 blocks so that append() will re-open an existing block // instead of just adding a new one DFSTestUtil.createFile(fs, new Path("/test"), 4*BLOCK_SIZE + BLOCK_SIZE/2, (short) 3, 1L); // Roll edit log so that, when the SBN restarts, it will load // the namespace during startup. nn0.getRpcServer().rollEditLog(); banner("Restarting standby"); restartStandby(); // It will initially have all of the blocks necessary. assertSafeMode(nn1, 5, 5, 3, 0); // Append to a block while SBN is in safe mode. This should // not affect safemode initially, since the DN message // will get queued. FSDataOutputStream stm = fs.append(new Path("/test")); try { assertSafeMode(nn1, 5, 5, 3, 0); // if we roll edits now, the SBN should see that it's under construction // and change its total count and safe count down by one, since UC // blocks are not counted by safe mode. HATestUtil.waitForStandbyToCatchUp(nn0, nn1); assertSafeMode(nn1, 4, 4, 3, 0); } finally { IOUtils.closeStream(stm); } // Delete those blocks while the SBN is in safe mode. // This will not ACK the deletions to the SBN, so it won't // notice until we roll the edit log. banner("Removing the blocks without rolling the edit log"); fs.delete(new Path("/test"), true); BlockManagerTestUtil.computeAllPendingWork( nn0.getNamesystem().getBlockManager()); banner("Triggering deletions on DNs and Deletion Reports"); cluster.triggerHeartbeats(); HATestUtil.waitForDNDeletions(cluster); cluster.triggerDeletionReports(); assertSafeMode(nn1, 4, 4, 3, 0); // When we roll the edit log, the deletions will go through. banner("Waiting for standby to catch up to active namespace"); HATestUtil.waitForStandbyToCatchUp(nn0, nn1); assertSafeMode(nn1, 0, 0, 3, 0); } /** * Regression test for a bug experienced while developing * HDFS-2742. The scenario here is: * - image contains some blocks * - edits log contains at least one block addition, followed * by deletion of more blocks than were added. * - When node starts up, some incorrect accounting of block * totals caused an assertion failure. */ @Test public void testBlocksDeletedInEditLog() throws Exception { banner("Starting with NN0 active and NN1 standby, creating some blocks"); // Make 4 blocks persisted in the image. DFSTestUtil.createFile(fs, new Path("/test"), 4*BLOCK_SIZE, (short) 3, 1L); NameNodeAdapter.enterSafeMode(nn0, false); NameNodeAdapter.saveNamespace(nn0); NameNodeAdapter.leaveSafeMode(nn0); // OP_ADD for 2 blocks DFSTestUtil.createFile(fs, new Path("/test2"), 2*BLOCK_SIZE, (short) 3, 1L); // OP_DELETE for 4 blocks fs.delete(new Path("/test"), true); restartActive(); } private static void assertSafeMode(NameNode nn, int safe, int total, int numNodes, int nodeThresh) { String status = nn.getNamesystem().getSafemode(); if (safe == total) { assertTrue("Bad safemode status: '" + status + "'", status.startsWith( "Safe mode is ON. The reported blocks " + safe + " has reached the " + "threshold 0.9990 of total blocks " + total + ". The number of " + "live datanodes " + numNodes + " has reached the minimum number " + nodeThresh + ". Safe mode will be turned off automatically")); } else { int additional = total - safe; assertTrue("Bad safemode status: '" + status + "'", status.startsWith( "Safe mode is ON. " + "The reported blocks " + safe + " needs additional " + additional + " blocks")); } } /** * Set up a namesystem with several edits, both deletions and * additions, and failover to a new NN while that NN is in * safemode. Ensure that it will exit safemode. */ @Test public void testComplexFailoverIntoSafemode() throws Exception { banner("Starting with NN0 active and NN1 standby, creating some blocks"); DFSTestUtil.createFile(fs, new Path("/test"), 3*BLOCK_SIZE, (short) 3, 1L); // Roll edit log so that, when the SBN restarts, it will load // the namespace during startup and enter safemode. nn0.getRpcServer().rollEditLog(); banner("Creating some blocks that won't be in the edit log"); DFSTestUtil.createFile(fs, new Path("/test2"), 5*BLOCK_SIZE, (short) 3, 1L); banner("Deleting the original blocks"); fs.delete(new Path("/test"), true); banner("Restarting standby"); restartStandby(); // We expect it to be on its way out of safemode, since all of the blocks // from the edit log have been reported. assertSafeMode(nn1, 3, 3, 3, 0); // Initiate a failover into it while it's in safemode banner("Initiating a failover into NN1 in safemode"); NameNodeAdapter.abortEditLogs(nn0); cluster.transitionToActive(1); assertSafeMode(nn1, 5, 5, 3, 0); } /** * Similar to {@link #testBlocksRemovedWhileInSafeMode()} except that * the OP_DELETE edits arrive at the SBN before the block deletion reports. * The tracking of safe blocks needs to properly account for the removal * of the blocks as well as the safe count. This is a regression test for * HDFS-2742. */ @Test public void testBlocksRemovedWhileInSafeModeEditsArriveFirst() throws Exception { banner("Starting with NN0 active and NN1 standby, creating some blocks"); DFSTestUtil.createFile(fs, new Path("/test"), 10*BLOCK_SIZE, (short) 3, 1L); // Roll edit log so that, when the SBN restarts, it will load // the namespace during startup. nn0.getRpcServer().rollEditLog(); banner("Restarting standby"); restartStandby(); // It will initially have all of the blocks necessary. String status = nn1.getNamesystem().getSafemode(); assertTrue("Bad safemode status: '" + status + "'", status.startsWith( "Safe mode is ON. The reported blocks 10 has reached the threshold " + "0.9990 of total blocks 10. The number of live datanodes 3 has " + "reached the minimum number 0. Safe mode will be turned off " + "automatically")); // Delete those blocks while the SBN is in safe mode. // Immediately roll the edit log before the actual deletions are sent // to the DNs. banner("Removing the blocks without rolling the edit log"); fs.delete(new Path("/test"), true); HATestUtil.waitForStandbyToCatchUp(nn0, nn1); // Should see removal of the blocks as well as their contribution to safe block count. assertSafeMode(nn1, 0, 0, 3, 0); banner("Triggering sending deletions to DNs and Deletion Reports"); BlockManagerTestUtil.computeAllPendingWork( nn0.getNamesystem().getBlockManager()); cluster.triggerHeartbeats(); HATestUtil.waitForDNDeletions(cluster); cluster.triggerDeletionReports(); // No change in assertion status here, but some of the consistency checks // in safemode will fire here if we accidentally decrement safe block count // below 0. assertSafeMode(nn1, 0, 0, 3, 0); } /** * Test that the number of safe blocks is accounted correctly even when * blocks move between under-construction state and completed state. * If a FINALIZED report arrives at the SBN before the block is marked * COMPLETE, then when we get the OP_CLOSE we need to count it as "safe" * at that point. This is a regression test for HDFS-2742. */ @Test public void testSafeBlockTracking() throws Exception { banner("Starting with NN0 active and NN1 standby, creating some " + "UC blocks plus some other blocks to force safemode"); DFSTestUtil.createFile(fs, new Path("/other-blocks"), 10*BLOCK_SIZE, (short) 3, 1L); List<FSDataOutputStream> stms = Lists.newArrayList(); try { for (int i = 0; i < 5; i++) { FSDataOutputStream stm = fs.create(new Path("/test-uc-" + i)); stms.add(stm); stm.write(1); stm.hflush(); } // Roll edit log so that, when the SBN restarts, it will load // the namespace during startup and enter safemode. nn0.getRpcServer().rollEditLog(); } finally { for (FSDataOutputStream stm : stms) { IOUtils.closeStream(stm); } } banner("Restarting SBN"); restartStandby(); assertSafeMode(nn1, 10, 10, 3, 0); banner("Allowing SBN to catch up"); HATestUtil.waitForStandbyToCatchUp(nn0, nn1); assertSafeMode(nn1, 15, 15, 3, 0); } /** * Regression test for HDFS-2753. In this bug, the following sequence was * observed: * - Some blocks are written to DNs while the SBN was down. This causes * the blockReceived messages to get queued in the BPServiceActor on the * DN. * - When the SBN returns, the DN re-registers with the SBN, and then * flushes its blockReceived queue to the SBN before it sends its * first block report. This caused the first block report to be * incorrect ignored. * - The SBN would become stuck in safemode. */ @Test public void testBlocksAddedWhileStandbyIsDown() throws Exception { DFSTestUtil.createFile(fs, new Path("/test"), 3*BLOCK_SIZE, (short) 3, 1L); banner("Stopping standby"); cluster.shutdownNameNode(1); DFSTestUtil.createFile(fs, new Path("/test2"), 3*BLOCK_SIZE, (short) 3, 1L); banner("Rolling edit log so standby gets all edits on restart"); nn0.getRpcServer().rollEditLog(); restartStandby(); assertSafeMode(nn1, 6, 6, 3, 0); } /** * Regression test for HDFS-2804: standby should not populate replication * queues when exiting safe mode. */ @Test public void testNoPopulatingReplQueuesWhenExitingSafemode() throws Exception { DFSTestUtil.createFile(fs, new Path("/test"), 15*BLOCK_SIZE, (short)3, 1L); HATestUtil.waitForStandbyToCatchUp(nn0, nn1); // get some blocks in the SBN's image nn1.getRpcServer().setSafeMode(SafeModeAction.SAFEMODE_ENTER, false); NameNodeAdapter.saveNamespace(nn1); nn1.getRpcServer().setSafeMode(SafeModeAction.SAFEMODE_LEAVE, false); // and some blocks in the edit logs DFSTestUtil.createFile(fs, new Path("/test2"), 15*BLOCK_SIZE, (short)3, 1L); nn0.getRpcServer().rollEditLog(); cluster.stopDataNode(1); cluster.shutdownNameNode(1); //Configuration sbConf = cluster.getConfiguration(1); //sbConf.setInt(DFSConfigKeys.DFS_NAMENODE_SAFEMODE_EXTENSION_KEY, 1); cluster.restartNameNode(1, false); nn1 = cluster.getNameNode(1); GenericTestUtils.waitFor(new Supplier<Boolean>() { @Override public Boolean get() { return !nn1.isInSafeMode(); } }, 100, 10000); BlockManagerTestUtil.updateState(nn1.getNamesystem().getBlockManager()); assertEquals(0L, nn1.getNamesystem().getUnderReplicatedBlocks()); assertEquals(0L, nn1.getNamesystem().getPendingReplicationBlocks()); } /** * Make sure that when we transition to active in safe mode that we don't * prematurely consider blocks missing just because not all DNs have reported * yet. * * This is a regression test for HDFS-3921. */ @Test public void testNoPopulatingReplQueuesWhenStartingActiveInSafeMode() throws IOException { DFSTestUtil.createFile(fs, new Path("/test"), 15*BLOCK_SIZE, (short)3, 1L); // Stop the DN so that when the NN restarts not all blocks wil be reported // and the NN won't leave safe mode. cluster.stopDataNode(1); // Restart the namenode but don't wait for it to hear from all DNs (since // one DN is deliberately shut down.) cluster.restartNameNode(0, false); cluster.transitionToActive(0); assertTrue(cluster.getNameNode(0).isInSafeMode()); // We shouldn't yet consider any blocks "missing" since we're in startup // safemode, i.e. not all DNs may have reported. assertEquals(0, cluster.getNamesystem(0).getMissingBlocksCount()); } /** * Print a big banner in the test log to make debug easier. */ static void banner(String string) { LOG.info("\n\n\n\n================================================\n" + string + "\n" + "==================================================\n\n"); } /** * DFS#isInSafeMode should check the ActiveNNs safemode in HA enabled cluster. HDFS-3507 * * @throws Exception */ @Test public void testIsInSafemode() throws Exception { // Check for the standby nn without client failover. NameNode nn2 = cluster.getNameNode(1); assertTrue("nn2 should be in standby state", nn2.isStandbyState()); InetSocketAddress nameNodeAddress = nn2.getNameNodeAddress(); Configuration conf = new Configuration(); DistributedFileSystem dfs = new DistributedFileSystem(); try { dfs.initialize( URI.create("hdfs://" + nameNodeAddress.getHostName() + ":" + nameNodeAddress.getPort()), conf); dfs.isInSafeMode(); fail("StandBy should throw exception for isInSafeMode"); } catch (IOException e) { if (e instanceof RemoteException) { IOException sbExcpetion = ((RemoteException) e).unwrapRemoteException(); assertTrue("StandBy nn should not support isInSafeMode", sbExcpetion instanceof StandbyException); } else { throw e; } } finally { if (null != dfs) { dfs.close(); } } // Check with Client FailOver cluster.transitionToStandby(0); cluster.transitionToActive(1); cluster.getNameNodeRpc(1).setSafeMode(SafeModeAction.SAFEMODE_ENTER, false); DistributedFileSystem dfsWithFailOver = (DistributedFileSystem) fs; assertTrue("ANN should be in SafeMode", dfsWithFailOver.isInSafeMode()); cluster.getNameNodeRpc(1).setSafeMode(SafeModeAction.SAFEMODE_LEAVE, false); assertFalse("ANN should be out of SafeMode", dfsWithFailOver.isInSafeMode()); } }