/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs;

import java.io.*;
import java.util.Iterator;
import java.util.Random;
import java.net.*;

import junit.framework.TestCase;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.ClientProtocol;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.FSConstants;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
import org.apache.hadoop.hdfs.protocol.FSConstants.DatanodeReportType;
import org.apache.hadoop.hdfs.server.datanode.BlockInlineChecksumWriter;
import org.apache.hadoop.hdfs.server.datanode.DataNode;
import org.apache.hadoop.hdfs.server.datanode.SimulatedFSDataset;
import org.apache.hadoop.hdfs.server.namenode.BlockPlacementPolicy;
import org.apache.hadoop.hdfs.server.namenode.BlockPlacementPolicyConfigurable;
import org.apache.hadoop.hdfs.server.namenode.BlockPlacementPolicyDefault;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.BlockLocation;

import static org.junit.Assert.*;
import org.junit.Test;

/**
 * This class tests the replication of a DFS file.
 */
public class TestReplication {
  private static final long seed = 0xDEADBEEFL;
  private static final int blockSize = 8192;
  private static final int fileSize = 16384;
  private static final String[] racks = new String[] {
    "/d1/r1", "/d1/r1", "/d1/r2", "/d1/r2",
    "/d1/r2", "/d2/r3", "/d2/r3", "/d2/r3"
  };
  private static final int numDatanodes = racks.length;
  private static final Log LOG = LogFactory.getLog(
      "org.apache.hadoop.hdfs.TestReplication");
  private static final String[] racks4 = new String[] {
    racks[0], racks[1], racks[2], racks[3]
  };

  private void writeFile(FileSystem fileSys, Path name, int repl)
      throws IOException {
    // create and write a file that contains three blocks of data
    FSDataOutputStream stm = fileSys.create(name, true,
        fileSys.getConf().getInt("io.file.buffer.size", 4096),
        (short) repl, (long) blockSize);
    byte[] buffer = new byte[fileSize];
    Random rand = new Random(seed);
    rand.nextBytes(buffer);
    stm.write(buffer);
    stm.close();
  }

  /* check that at least two nodes are on the same rack */
  private void checkFile(FileSystem fileSys, Path name, int repl)
      throws IOException {
    Configuration conf = fileSys.getConf();
    ClientProtocol namenode = DFSClient.createNamenode(conf);
    waitForBlockReplication(name.toString(), namenode,
        Math.min(numDatanodes, repl), -1);

    LocatedBlocks locations = namenode.getBlockLocations(name.toString(), 0,
        Long.MAX_VALUE);
    FileStatus stat = fileSys.getFileStatus(name);
    BlockLocation[] blockLocations = fileSys.getFileBlockLocations(stat, 0L,
        Long.MAX_VALUE);

    // verify that rack locations match
    assertTrue(blockLocations.length == locations.locatedBlockCount());
    for (int i = 0; i < blockLocations.length; i++) {
      LocatedBlock blk = locations.get(i);
      DatanodeInfo[] datanodes = blk.getLocations();
      String[] topologyPaths = blockLocations[i].getTopologyPaths();
      String[] racks = blockLocations[i].getRacks();
      assertTrue(topologyPaths.length == datanodes.length);
      for (int j = 0; j < topologyPaths.length; j++) {
        boolean found = false;
        String matchedRack = null;
        for (int k = 0; k < racks.length; k++) {
          if (topologyPaths[j].startsWith(racks[k])) {
            found = true;
            matchedRack = racks[k];
            break;
          }
        }
        assertTrue(found);
        assertEquals("Rack info should be equal", matchedRack, racks[j]);
      }
    }

    boolean isOnSameRack = true, isNotOnSameRack = true;
    for (LocatedBlock blk : locations.getLocatedBlocks()) {
      DatanodeInfo[] datanodes = blk.getLocations();
      if (datanodes.length <= 1) break;
      if (datanodes.length == 2) {
        isNotOnSameRack = !(datanodes[0].getNetworkLocation().equals(
            datanodes[1].getNetworkLocation()));
        break;
      }
      isOnSameRack = false;
      isNotOnSameRack = false;
      for (int i = 0; i < datanodes.length - 1; i++) {
        LOG.info("datanode " + i + ": " + datanodes[i].getName());
        boolean onRack = false;
        for (int j = i + 1; j < datanodes.length; j++) {
          if (datanodes[i].getNetworkLocation().equals(
              datanodes[j].getNetworkLocation())) {
            onRack = true;
          }
        }
        if (onRack) {
          isOnSameRack = true;
        }
        if (!onRack) {
          isNotOnSameRack = true;
        }
        if (isOnSameRack && isNotOnSameRack) break;
      }
      if (!isOnSameRack || !isNotOnSameRack) break;
    }
    assertTrue(isOnSameRack);
    if (conf.getClass("dfs.block.replicator.classname", null,
        BlockPlacementPolicy.class).equals(
            BlockPlacementPolicyConfigurable.class) && repl == 2) {
      // For BlockPlacementPolicyConfigurable we do in-rack replication
      // for repl == 2.
      assertFalse(isNotOnSameRack);
    } else {
      assertTrue(isNotOnSameRack);
    }
  }

  private void cleanupFile(FileSystem fileSys, Path name) throws IOException {
    assertTrue(fileSys.exists(name));
    fileSys.delete(name, true);
    assertTrue(!fileSys.exists(name));
  }

  /*
   * Test if Datanode reports bad blocks during replication request
   */
  @Test
  public void testBadBlockReportOnTransfer() throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = null;
    DFSClient dfsClient = null;
    LocatedBlocks blocks = null;
    int replicaCount = 0;
    MiniDFSCluster cluster = new MiniDFSCluster(conf, 2, true, null);
    cluster.waitActive();
    fs = cluster.getFileSystem();
    dfsClient = new DFSClient(new InetSocketAddress("localhost",
        cluster.getNameNodePort()), conf);

    // Create file with replication factor of 1
    Path file1 = new Path("/tmp/testBadBlockReportOnTransfer/file1");
    DFSTestUtil.createFile(fs, file1, 1024, (short) 1, 0);
    DFSTestUtil.waitReplication(fs, file1, (short) 1);

    // Corrupt the block belonging to the created file
    Block block = DFSTestUtil.getFirstBlock(fs, file1);
    cluster.corruptBlockOnDataNodes(block);

    // Increase replication factor; this should invoke a transfer request.
    // The receiving datanode fails on checksum and reports it to the namenode.
    fs.setReplication(file1, (short) 2);

    // Now get block details and check if the block is corrupt
    blocks = dfsClient.namenode.getBlockLocations(
        file1.toString(), 0, Long.MAX_VALUE);
    while (!blocks.get(0).isCorrupt()) {
      try {
        LOG.info("Waiting until block is marked as corrupt...");
        Thread.sleep(1000);
      } catch (InterruptedException ie) {
      }
      blocks = dfsClient.namenode.getBlockLocations(
          file1.toString(), 0, Long.MAX_VALUE);
    }
    replicaCount = blocks.get(0).getLocations().length;
    assertTrue(replicaCount == 1);
    cluster.shutdown();
  }

  /**
   * Tests replication in DFS.
   */
  private void runReplication(boolean simulated,
      Class<? extends BlockPlacementPolicy> clazz) throws IOException {
    Configuration conf = new Configuration();
    conf.setClass("dfs.block.replicator.classname", clazz,
        BlockPlacementPolicy.class);
    conf.setBoolean("dfs.replication.considerLoad", false);
    if (simulated) {
      conf.setBoolean(SimulatedFSDataset.CONFIG_PROPERTY_SIMULATED, true);
    }
    MiniDFSCluster cluster = null;
    FileSystem fileSys = null;
    DFSClient client = null;
    try {
      cluster = new MiniDFSCluster(conf, numDatanodes, racks, null, true, true);
      cluster.waitActive();
      cluster.getNameNode().namesystem.refreshNodes(conf);
      client = new DFSClient(cluster.getNameNode().getNameNodeAddress(), conf);
      DatanodeInfo[] info = client.datanodeReport(DatanodeReportType.LIVE);
      assertEquals("Number of Datanodes ", numDatanodes, info.length);
      fileSys = cluster.getFileSystem();
      Path file1 = new Path("/smallblocktest.dat");
      writeFile(fileSys, file1, 3);
      checkFile(fileSys, file1, 3);
      cleanupFile(fileSys, file1);
      writeFile(fileSys, file1, 10);
      checkFile(fileSys, file1, 10);
      cleanupFile(fileSys, file1);
      writeFile(fileSys, file1, 4);
      checkFile(fileSys, file1, 4);
      cleanupFile(fileSys, file1);
      writeFile(fileSys, file1, 1);
      checkFile(fileSys, file1, 1);
      cleanupFile(fileSys, file1);
      writeFile(fileSys, file1, 2);
      checkFile(fileSys, file1, 2);
      cleanupFile(fileSys, file1);
    } finally {
      if (client != null) {
        client.close();
      }
      if (fileSys != null) {
        fileSys.close();
      }
      if (cluster != null) {
        cluster.shutdown();
      }
    }
  }

  @Test
  public void testReplicationSimulatedStorageDefault() throws IOException {
    runReplication(true, BlockPlacementPolicyDefault.class);
  }

  @Test
  public void testReplicationDefault() throws IOException {
    runReplication(false, BlockPlacementPolicyDefault.class);
  }

  @Test
  public void testReplicationSimulatedStorageConfigurable() throws IOException {
    runReplication(true, BlockPlacementPolicyConfigurable.class);
  }

  @Test
  public void testReplicationConfigurable() throws IOException {
    runReplication(false, BlockPlacementPolicyConfigurable.class);
  }

  // Waits for all of the blocks to have expected replication
  private void waitForBlockReplication(String filename,
      ClientProtocol namenode, int expected, long maxWaitSec)
      throws IOException {
    waitForBlockReplication(filename, namenode, expected, maxWaitSec, false);
  }

  // Waits for all of the blocks to have expected replication
  private void waitForBlockReplication(String filename,
      ClientProtocol namenode, int expected, long maxWaitSec,
      boolean isUnderConstruction) throws IOException {
    long start = System.currentTimeMillis();

    // wait for all the blocks to be replicated
    LOG.info("Checking for block replication for " + filename);
    while (true) {
      boolean replOk = true;
      LocatedBlocks blocks = namenode.getBlockLocations(filename, 0,
          Long.MAX_VALUE);

      for (Iterator<LocatedBlock> iter = blocks.getLocatedBlocks().iterator();
           iter.hasNext();) {
        LocatedBlock block = iter.next();
        if (isUnderConstruction && !iter.hasNext()) {
          break; // do not check the last block
        }
        int actual = block.getLocations().length;
        if (actual < expected) {
          LOG.info("Not enough replicas for " + block.getBlock() +
              " yet. Expecting " + expected + ", got " + actual + ".");
          replOk = false;
          break;
        }
      }

      if (replOk) {
        return;
      }

      if (maxWaitSec > 0 &&
          (System.currentTimeMillis() - start) > (maxWaitSec * 1000)) {
        throw new IOException("Timed out while waiting for all blocks to" +
            " be replicated for " + filename);
      }

      try {
        Thread.sleep(500);
      } catch (InterruptedException ignored) {}
    }
  }

  /*
   * This test makes sure that NameNode retries all the available blocks for
   * under replicated blocks.
   *
   * It creates a file with one block and replication of 4. It corrupts two of
   * the blocks and removes one of the replicas. The expected behaviour is that
   * the missing replica will be copied from one valid source.
   */
  @Test
  public void testPendingReplicationRetryDefault() throws IOException {
    runPendingReplicationRetry(BlockPlacementPolicyDefault.class);
  }

  @Test
  public void testPendingReplicationRetryConfigurable() throws IOException {
    runPendingReplicationRetry(BlockPlacementPolicyConfigurable.class);
  }

  /* This test makes sure that NameNode retries all the available blocks
   * for under replicated blocks.
   *
   * It creates a file with one block and replication of 4. It corrupts
   * two of the blocks and removes one of the replicas. The expected behaviour
   * is that the missing replica will be copied from one valid source.
   */
  private void runPendingReplicationRetry(
      Class<? extends BlockPlacementPolicy> clazz) throws IOException {
    pendingReplicationRetryInternal(false, clazz);
  }

  @Test
  public void testPendingReplicationRetryInlineChecksum() throws IOException {
    pendingReplicationRetryInternal(true, BlockPlacementPolicyDefault.class);
  }

  private void pendingReplicationRetryInternal(boolean inlineChecksum,
      Class<? extends BlockPlacementPolicy> clazz) throws IOException {
    MiniDFSCluster cluster = null;
    int numDataNodes = 4;
    String testFile = "/replication-test-file";
    Path testPath = new Path(testFile);

    byte[] buffer = new byte[1024];
    for (int i = 0; i < buffer.length; i++) {
      buffer[i] = '1';
    }

    try {
      Configuration conf = new Configuration();
      conf.setClass("dfs.block.replicator.classname", clazz,
          BlockPlacementPolicy.class);
      conf.set("dfs.replication", Integer.toString(numDataNodes));
      cluster = new MiniDFSCluster(conf, numDataNodes, racks4, null, true,
          true);
      cluster.waitActive();
      for (DataNode dn : cluster.getDataNodes()) {
        dn.useInlineChecksum = inlineChecksum;
      }
      DFSClient dfsClient = new DFSClient(new InetSocketAddress("localhost",
          cluster.getNameNodePort()), conf);

      OutputStream out = cluster.getFileSystem().create(testPath);
      out.write(buffer);
      out.close();

      waitForBlockReplication(testFile, dfsClient.namenode, numDataNodes, -1);

      // get first block of the file.
      Block block = dfsClient.namenode.getBlockLocations(testFile, 0,
          Long.MAX_VALUE).get(0).getBlock();
      File[] blockFiles = new File[6];
      for (int i = 0; i < 6; i++) {
        String fileName;
        if (!inlineChecksum) {
          fileName = block.getBlockName();
        } else {
          fileName = BlockInlineChecksumWriter.getInlineChecksumFileName(
              block, FSConstants.CHECKSUM_TYPE, cluster.conf.getInt(
                  "io.bytes.per.checksum",
                  FSConstants.DEFAULT_BYTES_PER_CHECKSUM));
        }
        blockFiles[i] = new File(cluster.getBlockDirectory("data" + (i + 1)),
            fileName);
      }
      cluster.shutdown();
      cluster = null;

      // Now mess up some of the replicas:
      // delete the first and corrupt the next two.
      for (int i = 0; i < 25; i++) {
        buffer[i] = '0';
      }

      int fileCount = 0;
      for (int i = 0; i < 6; i++) {
        File blockFile = blockFiles[i];
        LOG.info("Checking for file " + blockFile);

        if (blockFile.exists()) {
          if (fileCount == 0) {
            LOG.info("Deleting file " + blockFile);
            assertTrue(blockFile.delete());
          } else {
            // corrupt it.
LOG.info("Corrupting file " + blockFile); long len = blockFile.length(); assertTrue(len > 50); RandomAccessFile blockOut = new RandomAccessFile(blockFile, "rw"); try { blockOut.seek(len/3); blockOut.write(buffer, 0, 25); } finally { blockOut.close(); } } fileCount++; } } assertEquals(3, fileCount); /* Start the MiniDFSCluster with more datanodes since once a writeBlock * to a datanode node fails, same block can not be written to it * immediately. In our case some replication attempts will fail. */ LOG.info("Restarting minicluster after deleting a replica and corrupting 2 crcs"); conf = new Configuration(); conf.setClass("dfs.block.replicator.classname", clazz, BlockPlacementPolicy.class); // first time format conf.set("dfs.replication", Integer.toString(numDataNodes)); conf.set("dfs.replication.pending.timeout.sec", Integer.toString(2)); conf.set("dfs.datanode.block.write.timeout.sec", Integer.toString(5)); conf.set("dfs.safemode.threshold.pct", "0.75f"); // only 3 copies exist conf.setBoolean("dfs.use.inline.checksum", !inlineChecksum); cluster = new MiniDFSCluster(conf, numDataNodes * 2, racks, null, true, true, false); cluster.waitActive(); dfsClient = new DFSClient(new InetSocketAddress("localhost", cluster.getNameNodePort()), conf); waitForBlockReplication(testFile, dfsClient.namenode, numDataNodes, -1); } finally { if (cluster != null) { cluster.shutdown(); } } } private void testReplicateLenMismatchedBlockInternal(boolean inlineChecksum) throws Exception { Configuration conf = new Configuration(); conf.setBoolean("dfs.use.inline.checksum", inlineChecksum); MiniDFSCluster cluster = new MiniDFSCluster(conf, 2, true, null); try { cluster.waitActive(); // test truncated block changeBlockLen(cluster, -1, inlineChecksum); // test extended block changeBlockLen(cluster, 1, inlineChecksum); } finally { cluster.shutdown(); } } /** * Test if replication can detect mismatched length on-disk blocks * @throws Exception */ @Test public void testReplicateLenMismatchedBlock() throws Exception { testReplicateLenMismatchedBlockInternal(false); } /** * Test if replication can detect mismatched length on-disk blocks * @throws Exception */ @Test public void testReplicateLenMismatchedBlockInlineChecksum() throws Exception { testReplicateLenMismatchedBlockInternal(true); } private void changeBlockLen(MiniDFSCluster cluster, int lenDelta, boolean isInlineChecksum) throws IOException, InterruptedException { final Path fileName = new Path("/file1"); final short REPLICATION_FACTOR = (short)1; final FileSystem fs = cluster.getFileSystem(); final int fileLen = fs.getConf().getInt("io.bytes.per.checksum", 512); DFSTestUtil.createFile(fs, fileName, fileLen, REPLICATION_FACTOR, 0); DFSTestUtil.waitReplication(fs, fileName, REPLICATION_FACTOR); String block; if (!isInlineChecksum) { block = DFSTestUtil.getFirstBlock(fs, fileName).getBlockName(); } else { block = BlockInlineChecksumWriter.getInlineChecksumFileName(DFSTestUtil .getFirstBlock(fs, fileName), FSConstants.CHECKSUM_TYPE, cluster.conf .getInt("io.bytes.per.checksum", FSConstants.DEFAULT_BYTES_PER_CHECKSUM)); } // Change the length of a replica for (int i=0; i<cluster.getDataNodes().size(); i++) { if (TestDatanodeBlockScanner.changeReplicaLength(block, i, lenDelta, cluster)) { break; } } // increase the file's replication factor fs.setReplication(fileName, (short)(REPLICATION_FACTOR+1)); // block replication triggers corrupt block detection DFSClient dfsClient = new DFSClient(new InetSocketAddress("localhost", cluster.getNameNodePort()), fs.getConf()); 
    LocatedBlocks blocks = dfsClient.namenode.getBlockLocations(
        fileName.toString(), 0, fileLen);
    if (lenDelta < 0) { // replica truncated
      while (!blocks.get(0).isCorrupt() ||
          REPLICATION_FACTOR != blocks.get(0).getLocations().length) {
        Thread.sleep(100);
        blocks = dfsClient.namenode.getBlockLocations(
            fileName.toString(), 0, fileLen);
      }
    } else { // no corruption detected; block replicated
      while (!blocks.get(0).isCorrupt() &&
          REPLICATION_FACTOR + 1 != blocks.get(0).getLocations().length) {
        Thread.sleep(100);
        blocks = dfsClient.namenode.getBlockLocations(
            fileName.toString(), 0, fileLen);
      }
      LOG.info("Block is " +
          (blocks.get(0).isCorrupt() ? "corrupted" : "healthy"));
      LOG.info("Replication number: " + blocks.get(0).getLocations().length);
    }
    fs.delete(fileName, true);
  }

  /* This test makes sure that all blocks except the last one in an under
   * construction file are replicated.
   *
   * It writes a file of two small blocks with replication numDataNodes - 1
   * and leaves it under construction, then raises the replication factor and
   * verifies that every block except the last reaches the new replication.
   */
  @Test
  public void testBlockReplicationInUCF() throws IOException {
    MiniDFSCluster cluster = null;
    short numDataNodes = 3;
    String testFile = "/replication-test-file";
    Path testPath = new Path(testFile);

    byte[] buffer = new byte[1024];
    for (int i = 0; i < buffer.length; i++) {
      buffer[i] = '1';
    }

    try {
      Configuration conf = new Configuration();
      conf.set("dfs.replication", Integer.toString(numDataNodes - 1));
      conf.setLong("dfs.block.size", 1024L);
      cluster = new MiniDFSCluster(0, conf, numDataNodes, true, true, null,
          null);
      cluster.waitActive();
      DFSClient dfsClient = new DFSClient(new InetSocketAddress("localhost",
          cluster.getNameNodePort()), conf);
      FileSystem fs = cluster.getFileSystem();

      OutputStream out = fs.create(testPath);
      out.write(buffer);
      out.write(buffer);
      waitForBlockReplication(testFile, dfsClient.namenode, numDataNodes - 1,
          -1, true);

      // bump this file's replication factor
      fs.setReplication(testPath, numDataNodes);
      waitForBlockReplication(testFile, dfsClient.namenode, numDataNodes, 300,
          true);
    } finally {
      if (cluster != null) {
        cluster.shutdown();
      }
    }
  }

  /*
   * Test if the rate cap takes effect
   */
  @Test
  public void testRateCap() throws Exception {
    Configuration conf = new Configuration();
    conf.setLong("dfs.data.transfer.max.bytes.per.sec", 128 * 1024);
    FileSystem fs = null;
    MiniDFSCluster cluster = new MiniDFSCluster(conf, 2, true, null);
    try {
      cluster.waitActive();
      fs = cluster.getFileSystem();

      // Create file with replication factor of 1
      Path file1 = new Path("/tmp/testRateCap");
      DFSTestUtil.createFile(fs, file1, 512 * 1024, (short) 1, 0);
      DFSTestUtil.waitReplication(fs, file1, (short) 1);

      // Make sure replication doesn't finish too fast.
      long startTime = System.currentTimeMillis();
      fs.setReplication(file1, (short) 2);
      DFSTestUtil.waitReplication(fs, file1, (short) 2);
      long endTime = System.currentTimeMillis();
      long length = endTime - startTime;
      System.out.println("Taking " + length + " ms to replicate.");
      TestCase.assertTrue(endTime - startTime > 3700);
    } finally {
      cluster.shutdown();
    }
  }
}