package org.apache.hadoop.hdfs.server.namenode;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Random;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DFSClient;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.TestRaidDfs;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo.AdminStates;
import org.apache.hadoop.hdfs.protocol.FSConstants.DatanodeReportType;
import org.apache.hadoop.raid.Codec;
import org.apache.hadoop.raid.RaidNode;
import org.apache.hadoop.raid.RaidUtils;
import org.apache.hadoop.raid.TestDirectoryRaidDfs;
import org.apache.hadoop.raid.Utils;
import org.junit.Test;

import junit.framework.Assert;
import junit.framework.TestCase;

public class TestDirectoryRaidBlockPlacement extends TestCase {
  final static String TEST_DIR = new File(System.getProperty(
      "test.build.data", "build/contrib/raid/test/data")).getAbsolutePath();

  private Configuration conf = null;
  private MiniDFSCluster cluster = null;
  private FSNamesystem namesystem = null;
  private BlockPlacementPolicyRaid policy = null;
  private FileSystem fs = null;
  private FileSystem localFileSys = null;
  private DistributedFileSystem dfs = null;
  private static long CAPACITY = 10240000L;
  private Path excludeFile;
  private Random rand = new Random();

  String[] racks1 = {"/rack1", "/rack1", "/rack1"};
  String[] hosts1 = {"host1.rack1.com", "host2.rack1.com", "host3.rack1.com"};
  String[] racks2 = {"/rack1", "/rack2", "/rack2", "/rack2"};
  String[] hosts2 = {"host1.rack1.com", "host2.rack2.com", "host3.rack2.com",
      "host4.rack2.com"};

  final static Log LOG =
      LogFactory.getLog(TestDirectoryRaidBlockPlacement.class);

  private void cleanFile(Path p) throws IOException {
    File f = new File(p.toUri().getPath());
    f.getParentFile().mkdirs();
    if (f.exists()) {
      f.delete();
    }
    f.createNewFile();
  }

  private void writeConfigFile(Path name, ArrayList<String> nodes)
      throws IOException {
    // delete if it already exists
    if (localFileSys.exists(name)) {
      localFileSys.delete(name, true);
    }
    FSDataOutputStream stm = localFileSys.create(name);
    if (nodes != null) {
      for (Iterator<String> it = nodes.iterator(); it.hasNext();) {
        String node = it.next();
        stm.writeBytes(node);
        stm.writeBytes("\n");
      }
    }
    stm.close();
  }

  protected void setupCluster(boolean simulated, long minFileSize,
      String[] racks, String[] hosts) throws IOException {
    conf = new Configuration();
    localFileSys = FileSystem.getLocal(conf);
    conf.setLong("dfs.blockreport.intervalMsec", 1000L);
    conf.set("dfs.replication.pending.timeout.sec", "2");
    conf.setLong("dfs.block.size", 1L);
    conf.set("dfs.block.replicator.classname",
        "org.apache.hadoop.hdfs.server.namenode.BlockPlacementPolicyRaid");
    conf.setLong("hdfs.raid.min.filesize", minFileSize);
    Utils.loadTestCodecs(conf, 5, 5, 1, 3, "/raid", "/raidrs", false, true);
    conf.setInt("io.bytes.per.checksum", 1);
    excludeFile = new Path(TEST_DIR, "exclude" + System.currentTimeMillis());
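    // The exclude file is registered as dfs.hosts.exclude below; the
    // decommission test later writes rack2 datanode names into it and calls
    // refreshNodes() to take those nodes out of service.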
    cleanFile(excludeFile);
    conf.set("dfs.hosts.exclude", excludeFile.toUri().getPath());
    writeConfigFile(excludeFile, null);

    if (!simulated) {
      cluster = new MiniDFSCluster(conf, hosts.length, true, racks, hosts);
    } else {
      long[] capacities = new long[]{CAPACITY, CAPACITY, CAPACITY};
      cluster = new MiniDFSCluster(0, conf, hosts.length, true, true, null,
          racks, capacities);
    }
    cluster.waitActive();
    namesystem = cluster.getNameNode().getNamesystem();
    Assert.assertTrue("BlockPlacementPolicy type is not correct.",
        namesystem.replicator instanceof BlockPlacementPolicyRaid);
    policy = (BlockPlacementPolicyRaid) namesystem.replicator;
    fs = cluster.getFileSystem();
    dfs = (DistributedFileSystem) fs;
    TestDirectoryRaidDfs.setupStripeStore(conf, fs);
  }

  protected void closeCluster() throws IOException {
    if (null != fs) {
      fs.close();
    }
    if (null != cluster) {
      cluster.shutdown();
    }
  }

  // Logs the block locations of the given file and returns the largest
  // replica count seen across its blocks.
  private int printLocatedBlocks(Path filePath) throws Exception {
    LocatedBlocks lbs = dfs.getLocatedBlocks(filePath, 0L, Integer.MAX_VALUE);
    StringBuilder sb = new StringBuilder();
    sb.append("Path " + filePath + ":");
    int maxRepl = 0;
    for (LocatedBlock lb : lbs.getLocatedBlocks()) {
      sb.append(lb.getBlock());
      sb.append(":");
      for (DatanodeInfo loc : lb.getLocations()) {
        sb.append(loc.getHostName());
        sb.append(" ");
      }
      if (lb.getLocations().length > maxRepl) {
        maxRepl = lb.getLocations().length;
      }
    }
    LOG.info(sb.toString());
    return maxRepl;
  }

  /* Get DFSClient to the namenode */
  private static DFSClient getDfsClient(NameNode nn, Configuration conf)
      throws IOException {
    return new DFSClient(nn.getNameNodeAddress(), conf);
  }

  // Waits up to one minute for every datanode in the list to reach the
  // given admin state.
  private void waitState(ArrayList<DatanodeInfo> infos, AdminStates state)
      throws Exception {
    long startTime = System.currentTimeMillis();
    while (System.currentTimeMillis() - startTime < 60000) {
      int i = 0;
      for (DatanodeInfo di : infos) {
        if (di.getAdminState() == state) {
          i++;
        }
      }
      if (i == infos.size()) {
        return;
      }
      Thread.sleep(1000);
    }
  }

  /*
   * This test creates a directory with 3 files and its fake parity file.
   * We decommission all nodes in rack2 to make sure all data are stored
   * on the rack1 machine.
   * Then we bring the rack2 machines back to the normal state and create a
   * non-raided file, which is too small to be raided, in the directory with
   * 4 replicas (1 in rack1 and 3 in rack2).
   * Then we reduce the replication to 3 to trigger chooseReplicaToDelete.
   * We verify that the remaining replicas have 1 in rack1 and 2 in rack2.
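   * Since BlockPlacementPolicyRaid is the configured replicator, we expect
   * the excess replica to be removed from rack2 rather than the single rack1
   * replica, which is what the final assertion checks.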
   */
  @Test
  public void testChooseReplicasToDeleteForSmallFile() throws Exception {
    try {
      setupCluster(false, 512L, racks2, hosts2);
      // create test files
      int numFiles = 4;
      long blockSize = 1024L;
      String parentDir = "/dir/";
      DFSClient client = getDfsClient(cluster.getNameNode(), conf);
      DatanodeInfo[] infos = client.datanodeReport(DatanodeReportType.LIVE);
      ArrayList<String> rack2nodes = new ArrayList<String>();
      ArrayList<DatanodeInfo> rack2di = new ArrayList<DatanodeInfo>();
      for (DatanodeInfo di : infos) {
        if (di.getHostName().contains("rack2")) {
          rack2nodes.add(di.getName());
          rack2di.add(cluster.getNameNode().namesystem.getDatanode(di));
        }
      }
      LOG.info("Decommission rack2 nodes");
      writeConfigFile(excludeFile, rack2nodes);
      cluster.getNameNode().namesystem.refreshNodes(conf);
      waitState(rack2di, AdminStates.DECOMMISSIONED);
      for (int i = 0; i < numFiles; i++) {
        if (i == 2) {
          continue;
        }
        String file = parentDir + "file" + i;
        Path filePath = new Path(file);
        TestRaidDfs.createTestFile(fs, filePath, 1, 1, blockSize);
        printLocatedBlocks(filePath);
      }
      LOG.info("Created " + (numFiles - 1) + " files");

      // create fake parity file
      Codec code = Codec.getCodec("xor");
      long numStripes = RaidNode.numStripes(numFiles, code.stripeLength);
      Path parityPath = new Path(code.parityDirectory, "dir");
      TestRaidDfs.createTestFile(fs, parityPath, 1,
          (int) numStripes * code.parityLength, blockSize);
      LOG.info("Create parity file: " + parityPath);
      printLocatedBlocks(parityPath);

      LOG.info("Bring back rack2 nodes out of decommission");
      writeConfigFile(excludeFile, null);
      cluster.getNameNode().namesystem.refreshNodes(conf);
      waitState(rack2di, AdminStates.NORMAL);

      Path smallFilePath = new Path(parentDir + "file2");
      TestRaidDfs.createTestFile(fs, smallFilePath, 4, 1, 256L);
      assertEquals("all datanodes should have replicas", hosts2.length,
          printLocatedBlocks(smallFilePath));
      LOG.info("Created small file: " + smallFilePath);

      LOG.info("Reduce replication to 3");
      dfs.setReplication(smallFilePath, (short) 3);
      long startTime = System.currentTimeMillis();
      while (System.currentTimeMillis() - startTime < 120000 &&
          printLocatedBlocks(smallFilePath) == 4) {
        Thread.sleep(1000);
      }
      LocatedBlocks lbs = dfs.getLocatedBlocks(smallFilePath, 0L,
          Integer.MAX_VALUE);
      boolean hasRack1 = false;
      for (DatanodeInfo di : lbs.getLocatedBlocks().get(0).getLocations()) {
        if (di.getNetworkLocation().contains("rack1")) {
          hasRack1 = true;
          break;
        }
      }
      assertTrue("We should keep the nodes in rack1", hasRack1);
    } finally {
      closeCluster();
    }
  }

  /*
   * This test starts datanodes in simulated mode and keeps running
   * chooseReplicaToDelete to measure the average processing time and the
   * number of allocated objects.
   */
  @Test
  public void testDirXORChooseReplicasToDeletePerformance() throws Exception {
    try {
      setupCluster(true, 1L, racks1, hosts1);
      // create test files
      int numFiles = 1000;
      long blockSize = 1024L;
      String parentDir = "/dir/";
      for (int i = 0; i < numFiles; i++) {
        String file = parentDir + "file" + i;
        TestRaidDfs.createTestFile(fs, new Path(file), 3, 1, blockSize);
      }
      LOG.info("Created " + numFiles + " files");
      Codec code = Codec.getCodec("xor");
      FSNamesystem fsNameSys = cluster.getNameNode().namesystem;
      for (DatanodeDescriptor dd : fsNameSys.datanodeMap.values()) {
        LOG.info(dd);
      }
      // create fake parity file
      long numStripes = RaidNode.numStripes(numFiles, code.stripeLength);
      TestRaidDfs.createTestFile(fs, new Path(code.parityDirectory, "dir"), 3,
          (int) numStripes * code.parityLength, blockSize);
      long startTime = System.currentTimeMillis();
      long total = 0L;
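      // Iterate every block in the blocks map under the namesystem read lock
      // and time chooseReplicaToDelete across the whole set; the average is
      // only logged, not asserted on.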
      fsNameSys.readLock();
      for (BlocksMap.BlockInfo bi : fsNameSys.blocksMap.getBlocks()) {
        fsNameSys.replicator.chooseReplicaToDelete(bi.getINode(), bi,
            (short) 3, fsNameSys.datanodeMap.values(),
            new ArrayList<DatanodeDescriptor>());
        total++;
      }
      fsNameSys.readUnlock();
      LOG.info("Average chooseReplicaToDelete time: " +
          ((double) (System.currentTimeMillis() - startTime) / total));
    } finally {
      closeCluster();
    }
  }

  @Test
  public void testGetCompanionBlocks() throws IOException {
    try {
      setupCluster(false, 1L, racks1, hosts1);
      String[] files = new String[] {"/dir/file1", "/dir/file2", "/dir/file3"};
      Codec codec = Codec.getCodec("rs");
      for (String file : files) {
        TestRaidDfs.createTestFile(fs, new Path(file), 3, 2, 8192L);
      }

      FileStatus stat = fs.getFileStatus(new Path("/dir"));
      RaidNode.doRaid(conf, stat, new Path(codec.parityDirectory), codec,
          new RaidNode.Statistics(), RaidUtils.NULL_PROGRESSABLE,
          false, 1, 1);

      Collection<LocatedBlock> companionBlocks;

      for (int i = 0; i < 2; i++) {
        for (int j = 0; j < 2; j++) {
          companionBlocks = getCompanionBlocks(
              namesystem, policy,
              getBlocks(namesystem, files[i]).get(j).getBlock());
          Assert.assertEquals(8, companionBlocks.size());
        }
      }

      companionBlocks = getCompanionBlocks(
          namesystem, policy,
          getBlocks(namesystem, files[2]).get(0).getBlock());
      Assert.assertEquals(8, companionBlocks.size());

      companionBlocks = getCompanionBlocks(
          namesystem, policy,
          getBlocks(namesystem, files[2]).get(1).getBlock());
      Assert.assertEquals(4, companionBlocks.size());

      String parityFile = "/raidrs/dir";

      for (int i = 0; i < 3; i++) {
        companionBlocks = getCompanionBlocks(
            namesystem, policy,
            getBlocks(namesystem, parityFile).get(i).getBlock());
        Assert.assertEquals(8, companionBlocks.size());
      }

      for (int i = 3; i < 6; i++) {
        companionBlocks = getCompanionBlocks(
            namesystem, policy,
            getBlocks(namesystem, parityFile).get(i).getBlock());
        Assert.assertEquals(4, companionBlocks.size());
      }
    } finally {
      closeCluster();
    }
  }

  // Looks up the file that owns the given block and asks the placement
  // policy for that block's companion blocks.
  private Collection<LocatedBlock> getCompanionBlocks(
      FSNamesystem namesystem, BlockPlacementPolicyRaid policy, Block block)
      throws IOException {
    INodeFile inode = namesystem.blocksMap.getINode(block);
    BlockPlacementPolicyRaid.FileInfo info =
        policy.getFileInfo(inode, inode.getFullPathName());
    return policy.getCompanionBlocks(inode.getFullPathName(), info, block,
        inode);
  }

  // Returns the located blocks of the given file from the namesystem.
  private List<LocatedBlock> getBlocks(FSNamesystem namesystem, String file)
      throws IOException {
    FileStatus stat = namesystem.getFileInfo(file);
    return namesystem.getBlockLocations(
        file, 0, stat.getLen()).getLocatedBlocks();
  }
}