/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.server.namenode.ha;

import static org.junit.Assert.assertEquals;

import java.io.IOException;
import java.io.PrintWriter;
import java.util.Collection;
import java.util.List;
import java.util.concurrent.CountDownLatch;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.commons.logging.impl.Log4JLogger;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.AppendTestUtil;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSTestUtil;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.MiniDFSNNTopology;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.protocolPB.DatanodeProtocolClientSideTranslatorPB;
import org.apache.hadoop.hdfs.server.blockmanagement.BlockCollection;
import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;
import org.apache.hadoop.hdfs.server.blockmanagement.BlockManagerTestUtil;
import org.apache.hadoop.hdfs.server.blockmanagement.BlockPlacementPolicy;
import org.apache.hadoop.hdfs.server.blockmanagement.BlockPlacementPolicyDefault;
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor;
import org.apache.hadoop.hdfs.server.datanode.DataNode;
import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils;
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
import org.apache.hadoop.hdfs.server.protocol.StorageBlockReport;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.test.GenericTestUtils;
import org.apache.hadoop.test.GenericTestUtils.DelayAnswer;
import org.apache.log4j.Level;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import org.mockito.Mockito;
import org.mockito.invocation.InvocationOnMock;

import com.google.common.base.Supplier;
import com.google.common.collect.Lists;

/**
 * Tests that a NameNode taking over as active correctly fences DataNode
 * commands (such as block invalidations) queued by the previous active,
 * so that an HA failover does not lose block replicas.
 */
public class TestDNFencing {

  protected static final Log LOG = LogFactory.getLog(TestDNFencing.class);
  private static final String TEST_FILE = "/testStandbyIsHot";
  private static final Path TEST_FILE_PATH = new Path(TEST_FILE);
  private static final int SMALL_BLOCK = 1024;

  private Configuration conf;
  private MiniDFSCluster cluster;
  private NameNode nn1, nn2;
  private FileSystem fs;

  static {
    ((Log4JLogger)LogFactory.getLog(FSNamesystem.class)).getLogger().setLevel(Level.ALL);
    ((Log4JLogger)LogFactory.getLog(BlockManager.class)).getLogger().setLevel(Level.ALL);
    ((Log4JLogger)NameNode.stateChangeLog).getLogger().setLevel(Level.ALL);
  }

  @Before
  public void setupCluster() throws Exception {
    conf = new Configuration();
    conf.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, SMALL_BLOCK);
    // Bump up replication interval so that we only run replication
    // checks explicitly.
    conf.setInt(DFSConfigKeys.DFS_NAMENODE_REPLICATION_INTERVAL_KEY, 600);
    // Increase max streams so that we re-replicate quickly.
    conf.setInt(DFSConfigKeys.DFS_NAMENODE_REPLICATION_MAX_STREAMS_KEY, 1000);
    // See RandomDeleterPolicy javadoc.
    conf.setClass("dfs.block.replicator.classname", RandomDeleterPolicy.class,
        BlockPlacementPolicy.class);

    conf.setInt(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, 1);
    cluster = new MiniDFSCluster.Builder(conf)
      .nnTopology(MiniDFSNNTopology.simpleHATopology())
      .numDataNodes(3)
      .build();
    nn1 = cluster.getNameNode(0);
    nn2 = cluster.getNameNode(1);

    cluster.waitActive();
    cluster.transitionToActive(0);
    // Trigger block reports so that the first NN trusts all
    // of the DNs, and will issue deletions
    cluster.triggerBlockReports();
    fs = HATestUtil.configureFailoverFs(cluster, conf);
  }

  @After
  public void shutdownCluster() throws Exception {
    if (cluster != null) {
      banner("Shutting down cluster. NN1 metadata:");
      doMetasave(nn1);
      banner("Shutting down cluster. NN2 metadata:");
      doMetasave(nn2);
      cluster.shutdown();
    }
  }

  @Test
  public void testDnFencing() throws Exception {
    // Create a file with replication level 3.
    DFSTestUtil.createFile(fs, TEST_FILE_PATH, 30*SMALL_BLOCK, (short)3, 1L);
    ExtendedBlock block = DFSTestUtil.getFirstBlock(fs, TEST_FILE_PATH);

    // Drop its replication count to 1, so it becomes over-replicated.
    // Then compute the invalidation of the extra blocks and trigger
    // heartbeats so the invalidations are flushed to the DNs.
    nn1.getRpcServer().setReplication(TEST_FILE, (short) 1);
    BlockManagerTestUtil.computeInvalidationWork(
        nn1.getNamesystem().getBlockManager());
    cluster.triggerHeartbeats();

    // Transition nn2 to active even though nn1 still thinks it's active.
    banner("Failing to NN2 but let NN1 continue to think it's active");
    NameNodeAdapter.abortEditLogs(nn1);
    NameNodeAdapter.enterSafeMode(nn1, false);
    cluster.transitionToActive(1);

    // Check that the standby picked up the replication change.
    assertEquals(1,
        nn2.getRpcServer().getFileInfo(TEST_FILE).getReplication());

    // Dump some info for debugging purposes.
    banner("NN2 Metadata immediately after failover");
    doMetasave(nn2);

    // Even though NN2 considers the blocks over-replicated, it should
    // postpone the block invalidation because the DNs are still "stale".
    assertEquals(30, nn2.getNamesystem().getPostponedMisreplicatedBlocks());

    banner("Triggering heartbeats and block reports so that fencing is completed");
    cluster.triggerHeartbeats();
    cluster.triggerBlockReports();

    banner("Metadata after nodes have all block-reported");
    doMetasave(nn2);

    // The blocks should no longer be postponed.
    assertEquals(0, nn2.getNamesystem().getPostponedMisreplicatedBlocks());

    // Wait for NN2 to enact its deletions (replication monitor has to run, etc)
    BlockManagerTestUtil.computeInvalidationWork(
        nn2.getNamesystem().getBlockManager());
    cluster.triggerHeartbeats();
    HATestUtil.waitForDNDeletions(cluster);
    cluster.triggerDeletionReports();
    assertEquals(0, nn2.getNamesystem().getUnderReplicatedBlocks());
    assertEquals(0, nn2.getNamesystem().getPendingReplicationBlocks());

    banner("Making sure the file is still readable");
    FileSystem fs2 = cluster.getFileSystem(1);
    DFSTestUtil.readFile(fs2, TEST_FILE_PATH);

    banner("Waiting for the actual block files to get deleted from DNs.");
    waitForTrueReplication(cluster, block, 1);
  }

  /**
   * Test case which restarts the standby node in such a way that,
   * when it exits safemode, it will want to invalidate a bunch
   * of over-replicated block replicas. Ensures that if we failover
   * at this point it won't lose data.
   */
  @Test
  public void testNNClearsCommandsOnFailoverAfterStartup()
      throws Exception {
    // Make lots of blocks to increase chances of triggering a bug.
    DFSTestUtil.createFile(fs, TEST_FILE_PATH, 30*SMALL_BLOCK, (short)3, 1L);

    banner("Shutting down NN2");
    cluster.shutdownNameNode(1);

    banner("Setting replication to 1, rolling edit log.");
    nn1.getRpcServer().setReplication(TEST_FILE, (short) 1);
    nn1.getRpcServer().rollEditLog();

    // Start NN2 again. When it starts up, it will see all of the
    // blocks as over-replicated, since it has the metadata for
    // replication=1, but the DNs haven't yet processed the deletions.
    banner("Starting NN2 again.");
    cluster.restartNameNode(1);
    nn2 = cluster.getNameNode(1);

    banner("triggering BRs");
    cluster.triggerBlockReports();

    // We expect that both NN1 and NN2 will have some number of
    // deletions queued up for the DNs.
    banner("computing invalidation on nn1");
    BlockManagerTestUtil.computeInvalidationWork(
        nn1.getNamesystem().getBlockManager());

    banner("computing invalidation on nn2");
    BlockManagerTestUtil.computeInvalidationWork(
        nn2.getNamesystem().getBlockManager());

    // Dump some info for debugging purposes.
    banner("Metadata immediately before failover");
    doMetasave(nn2);

    // Transition nn2 to active even though nn1 still thinks it's active
    banner("Failing to NN2 but let NN1 continue to think it's active");
    NameNodeAdapter.abortEditLogs(nn1);
    NameNodeAdapter.enterSafeMode(nn1, false);
    cluster.transitionToActive(1);

    // Check that the standby picked up the replication change.
    assertEquals(1,
        nn2.getRpcServer().getFileInfo(TEST_FILE).getReplication());

    // Dump some info for debugging purposes.
    banner("Metadata immediately after failover");
    doMetasave(nn2);

    banner("Triggering heartbeats and block reports so that fencing is completed");
    cluster.triggerHeartbeats();
    cluster.triggerBlockReports();

    banner("Metadata after nodes have all block-reported");
    doMetasave(nn2);

    // The block should no longer be postponed.
    assertEquals(0, nn2.getNamesystem().getPostponedMisreplicatedBlocks());

    // Wait for NN2 to enact its deletions (replication monitor has to run, etc)
    BlockManagerTestUtil.computeInvalidationWork(
        nn2.getNamesystem().getBlockManager());

    HATestUtil.waitForNNToIssueDeletions(nn2);
    cluster.triggerHeartbeats();
    HATestUtil.waitForDNDeletions(cluster);
    cluster.triggerDeletionReports();
    assertEquals(0, nn2.getNamesystem().getUnderReplicatedBlocks());
    assertEquals(0, nn2.getNamesystem().getPendingReplicationBlocks());

    banner("Making sure the file is still readable");
    FileSystem fs2 = cluster.getFileSystem(1);
    DFSTestUtil.readFile(fs2, TEST_FILE_PATH);
  }

  /**
   * Test case that reduces replication of a file with a lot of blocks
   * and then fails over right after those blocks enter the DN invalidation
   * queues on the active. Ensures that fencing is correct and no replicas
   * are lost.
   */
  @Test
  public void testNNClearsCommandsOnFailoverWithReplChanges()
      throws Exception {
    // Make lots of blocks to increase chances of triggering a bug.
    DFSTestUtil.createFile(fs, TEST_FILE_PATH, 30*SMALL_BLOCK, (short)1, 1L);

    banner("rolling NN1's edit log, forcing catch-up");
    HATestUtil.waitForStandbyToCatchUp(nn1, nn2);

    // Get some new replicas reported so that NN2 now considers
    // them over-replicated and schedules some more deletions
    nn1.getRpcServer().setReplication(TEST_FILE, (short) 2);
    while (BlockManagerTestUtil.getComputedDatanodeWork(
        nn1.getNamesystem().getBlockManager()) > 0) {
      LOG.info("Getting more replication work computed");
    }
    BlockManager bm1 = nn1.getNamesystem().getBlockManager();
    while (bm1.getPendingReplicationBlocksCount() > 0) {
      BlockManagerTestUtil.updateState(bm1);
      cluster.triggerHeartbeats();
      Thread.sleep(1000);
    }

    banner("triggering BRs");
    cluster.triggerBlockReports();

    nn1.getRpcServer().setReplication(TEST_FILE, (short) 1);

    banner("computing invalidation on nn1");
    BlockManagerTestUtil.computeInvalidationWork(
        nn1.getNamesystem().getBlockManager());
    doMetasave(nn1);

    banner("computing invalidation on nn2");
    BlockManagerTestUtil.computeInvalidationWork(
        nn2.getNamesystem().getBlockManager());
    doMetasave(nn2);

    // Dump some info for debugging purposes.
    banner("Metadata immediately before failover");
    doMetasave(nn2);

    // Transition nn2 to active even though nn1 still thinks it's active
    banner("Failing to NN2 but let NN1 continue to think it's active");
    NameNodeAdapter.abortEditLogs(nn1);
    NameNodeAdapter.enterSafeMode(nn1, false);
    BlockManagerTestUtil.computeInvalidationWork(
        nn2.getNamesystem().getBlockManager());
    cluster.transitionToActive(1);

    // Check that the standby picked up the replication change.
    assertEquals(1,
        nn2.getRpcServer().getFileInfo(TEST_FILE).getReplication());

    // Dump some info for debugging purposes.
    banner("Metadata immediately after failover");
    doMetasave(nn2);

    banner("Triggering heartbeats and block reports so that fencing is completed");
    cluster.triggerHeartbeats();
    cluster.triggerBlockReports();

    banner("Metadata after nodes have all block-reported");
    doMetasave(nn2);

    // The block should no longer be postponed.
    assertEquals(0, nn2.getNamesystem().getPostponedMisreplicatedBlocks());

    // Wait for NN2 to enact its deletions (replication monitor has to run, etc)
    BlockManagerTestUtil.computeInvalidationWork(
        nn2.getNamesystem().getBlockManager());

    HATestUtil.waitForNNToIssueDeletions(nn2);
    cluster.triggerHeartbeats();
    HATestUtil.waitForDNDeletions(cluster);
    cluster.triggerDeletionReports();
    assertEquals(0, nn2.getNamesystem().getUnderReplicatedBlocks());
    assertEquals(0, nn2.getNamesystem().getPendingReplicationBlocks());

    banner("Making sure the file is still readable");
    FileSystem fs2 = cluster.getFileSystem(1);
    DFSTestUtil.readFile(fs2, TEST_FILE_PATH);
  }

  /**
   * Regression test for HDFS-2742. The issue in this bug was:
   * - DN does a block report while file is open. This BR contains
   *   the block in RBW state.
   * - Standby queues the RBW state in PendingDatanodeMessages
   * - Standby processes edit logs during failover. Before fixing
   *   this bug, it was mistakenly applying the RBW reported state
   *   after the block had been completed, causing the block to get
   *   marked corrupt. Instead, we should now be applying the RBW
   *   message on OP_ADD, and then the FINALIZED message on OP_CLOSE.
   */
  @Test
  public void testBlockReportsWhileFileBeingWritten() throws Exception {
    FSDataOutputStream out = fs.create(TEST_FILE_PATH);
    try {
      AppendTestUtil.write(out, 0, 10);
      out.hflush();

      // The block report will include the RBW replica, but the report will
      // be queued on the StandbyNode.
      cluster.triggerBlockReports();
    } finally {
      IOUtils.closeStream(out);
    }

    cluster.transitionToStandby(0);
    cluster.transitionToActive(1);

    // Verify that no replicas are marked corrupt, and that the
    // file is readable from the failed-over standby.
    BlockManagerTestUtil.updateState(nn1.getNamesystem().getBlockManager());
    BlockManagerTestUtil.updateState(nn2.getNamesystem().getBlockManager());
    assertEquals(0, nn1.getNamesystem().getCorruptReplicaBlocks());
    assertEquals(0, nn2.getNamesystem().getCorruptReplicaBlocks());

    DFSTestUtil.readFile(fs, TEST_FILE_PATH);
  }

  /**
   * Test that, when a block is re-opened for append, the related
   * datanode messages are correctly queued by the SBN because
   * they have future states and genstamps.
   */
  @Test
  public void testQueueingWithAppend() throws Exception {
    int numQueued = 0;
    int numDN = cluster.getDataNodes().size();

    FSDataOutputStream out = fs.create(TEST_FILE_PATH);
    try {
      AppendTestUtil.write(out, 0, 10);
      out.hflush();

      // Opening the file will report the RBW replicas, but the reports will
      // be queued on the StandbyNode.
      numQueued += numDN; // RBW messages
    } finally {
      IOUtils.closeStream(out);
      numQueued += numDN; // blockReceived messages
    }

    cluster.triggerBlockReports();
    numQueued += numDN;

    try {
      out = fs.append(TEST_FILE_PATH);
      AppendTestUtil.write(out, 10, 10);
      // RBW replicas once it's opened for append
      numQueued += numDN;
    } finally {
      IOUtils.closeStream(out);
      numQueued += numDN; // blockReceived
    }

    cluster.triggerBlockReports();
    numQueued += numDN;

    assertEquals(numQueued, cluster.getNameNode(1).getNamesystem().
        getPendingDataNodeMessageCount());

    cluster.transitionToStandby(0);
    cluster.transitionToActive(1);

    // Verify that no replicas are marked corrupt, and that the
    // file is readable from the failed-over standby.
    BlockManagerTestUtil.updateState(nn1.getNamesystem().getBlockManager());
    BlockManagerTestUtil.updateState(nn2.getNamesystem().getBlockManager());
    assertEquals(0, nn1.getNamesystem().getCorruptReplicaBlocks());
    assertEquals(0, nn2.getNamesystem().getCorruptReplicaBlocks());

    AppendTestUtil.check(fs, TEST_FILE_PATH, 20);
  }

  /**
   * Another regression test for HDFS-2742. This tests the following sequence:
   * - DN does a block report while file is open. This BR contains
   *   the block in RBW state.
   * - The block report is delayed in reaching the standby.
   * - The file is closed.
   * - The standby processes the OP_ADD and OP_CLOSE operations before
   *   the RBW block report arrives.
   * - The standby should not mark the block as corrupt.
   */
  @Test
  public void testRBWReportArrivesAfterEdits() throws Exception {
    final CountDownLatch brFinished = new CountDownLatch(1);
    DelayAnswer delayer = new GenericTestUtils.DelayAnswer(LOG) {
      @Override
      protected Object passThrough(InvocationOnMock invocation)
          throws Throwable {
        try {
          return super.passThrough(invocation);
        } finally {
          // inform the test that our block report went through.
          brFinished.countDown();
        }
      }
    };

    FSDataOutputStream out = fs.create(TEST_FILE_PATH);
    try {
      AppendTestUtil.write(out, 0, 10);
      out.hflush();

      DataNode dn = cluster.getDataNodes().get(0);
      DatanodeProtocolClientSideTranslatorPB spy =
          DataNodeTestUtils.spyOnBposToNN(dn, nn2);

      Mockito.doAnswer(delayer)
          .when(spy).blockReport(
              Mockito.<DatanodeRegistration>anyObject(),
              Mockito.anyString(),
              Mockito.<StorageBlockReport[]>anyObject());
      dn.scheduleAllBlockReport(0);
      delayer.waitForCall();
    } finally {
      IOUtils.closeStream(out);
    }

    cluster.transitionToStandby(0);
    cluster.transitionToActive(1);

    delayer.proceed();
    brFinished.await();

    // Verify that no replicas are marked corrupt, and that the
    // file is readable from the failed-over standby.
    BlockManagerTestUtil.updateState(nn1.getNamesystem().getBlockManager());
    BlockManagerTestUtil.updateState(nn2.getNamesystem().getBlockManager());
    assertEquals(0, nn1.getNamesystem().getCorruptReplicaBlocks());
    assertEquals(0, nn2.getNamesystem().getCorruptReplicaBlocks());

    DFSTestUtil.readFile(fs, TEST_FILE_PATH);
  }

  /**
   * Print a big banner in the test log to make debug easier.
   */
  private void banner(String string) {
    LOG.info("\n\n\n\n================================================\n" +
        string + "\n" +
        "==================================================\n\n");
  }

  /**
   * Dump the given NameNode's block manager metadata to stderr, for
   * debugging test failures.
   */
  private void doMetasave(NameNode nn) {
    nn.getNamesystem().writeLock();
    try {
      PrintWriter pw = new PrintWriter(System.err);
      nn.getNamesystem().getBlockManager().metaSave(pw);
      pw.flush();
    } finally {
      nn.getNamesystem().writeUnlock();
    }
  }

  private void waitForTrueReplication(final MiniDFSCluster cluster,
      final ExtendedBlock block, final int waitFor) throws Exception {
    GenericTestUtils.waitFor(new Supplier<Boolean>() {
      @Override
      public Boolean get() {
        try {
          return getTrueReplication(cluster, block) == waitFor;
        } catch (IOException e) {
          throw new RuntimeException(e);
        }
      }
    }, 500, 10000);
  }

  private int getTrueReplication(MiniDFSCluster cluster, ExtendedBlock block)
      throws IOException {
    int count = 0;
    for (DataNode dn : cluster.getDataNodes()) {
      if (DataNodeTestUtils.getFSDataset(dn).getStoredBlock(
          block.getBlockPoolId(), block.getBlockId()) != null) {
        count++;
      }
    }
    return count;
  }

  /**
   * A BlockPlacementPolicy which, rather than using space available, makes
   * random decisions about which excess replica to delete.
   * This is because,
   * in the test cases, the two NNs will usually (but not quite always)
   * make the same decision of which replica to delete. The fencing issues
   * are exacerbated when the two NNs make different decisions, which can
   * happen in "real life" when they have slightly out-of-sync heartbeat
   * information regarding disk usage.
   */
  public static class RandomDeleterPolicy extends BlockPlacementPolicyDefault {

    public RandomDeleterPolicy() {
      super();
    }

    @Override
    public DatanodeDescriptor chooseReplicaToDelete(BlockCollection inode,
        Block block, short replicationFactor,
        Collection<DatanodeDescriptor> first,
        Collection<DatanodeDescriptor> second) {

      Collection<DatanodeDescriptor> chooseFrom =
          !first.isEmpty() ? first : second;

      List<DatanodeDescriptor> l = Lists.newArrayList(chooseFrom);
      return l.get(DFSUtil.getRandom().nextInt(l.size()));
    }
  }
}