/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.raid;

import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotSame;

import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;

import junit.framework.TestCase;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.RaidDFSUtil;
import org.apache.hadoop.hdfs.TestDatanodeBlockScanner;
import org.apache.hadoop.hdfs.TestRaidDfs;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
import org.apache.hadoop.mapred.MiniMRCluster;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobID;
import org.apache.hadoop.raid.DistBlockIntegrityMonitor.Worker.LostFileInfo;
import org.apache.hadoop.raid.LogUtils.LOGRESULTS;
import org.apache.hadoop.raid.LogUtils.LOGTYPES;
import org.apache.hadoop.raid.StripeStore.StripeInfo;
import org.apache.hadoop.util.StringUtils;
import org.junit.Test;

public class TestDirectoryBlockFixer extends TestCase {
  final static Log LOG = LogFactory.getLog(
      "org.apache.hadoop.raid.TestDirectoryBlockFixer");
  final static String TEST_DIR = new File(System.getProperty("test.build.data",
      "build/contrib/raid/test/data")).getAbsolutePath();
  final static String CHECKSUM_STORE_DIR = new File(TEST_DIR,
      "ckm_store." + System.currentTimeMillis()).getAbsolutePath();
  final static String STRIPE_STORE_DIR = new File(TEST_DIR, "stripe_store."
      + System.currentTimeMillis()).getAbsolutePath();
  final static String CONFIG_FILE = new File(TEST_DIR,
      "test-raid.xml").getAbsolutePath();
  final static long RELOAD_INTERVAL = 1000;
  final static int NUM_DATANODES = 3;

  final long blockSize = 8192L;
  final long[] fileSizes =
      new long[]{blockSize + blockSize/2,      // block 0, 1
                 3*blockSize,                  // block 2, 3
                 blockSize + blockSize/2 + 1}; // block 4, 5, 6, 7
  final long[] blockSizes = new long[]{blockSize, 2*blockSize, blockSize/2};
  final Integer[] rsCorruptFileIdx1 = new Integer[]{0, 1, 2, 3, 5, 6, 7};
  final int[] rsNumCorruptBlocksInFiles1 = new int[] {2, 2, 3};
  final Integer[] rsCorruptFileIdx2 = new Integer[]{1, 3, 4, 5, 6};
  final int[] rsNumCorruptBlocksInFiles2 = new int[] {1, 1, 3};

  Configuration conf;
  String namenode = null;
  MiniDFSCluster dfsCluster = null;
  String hftp = null;
  MiniMRCluster mr = null;
  FileSystem fileSys = null;
  RaidNode cnode = null;
  String jobTrackerName = null;
  Random rand = new Random();

  static {
    ParityFilePair.disableCacheUsedInTestOnly();
  }

  public Configuration getRaidNodeConfig(Configuration conf, boolean local) {
    // create an instance of the RaidNode
    Configuration localConf = new Configuration(conf);
    localConf.setInt("raid.blockfix.interval", 1000);
    if (local) {
      localConf.set("raid.blockfix.classname",
                    "org.apache.hadoop.raid.LocalBlockIntegrityMonitor");
    } else {
      localConf.set("raid.blockfix.classname",
                    "org.apache.hadoop.raid.DistBlockIntegrityMonitor");
    }
    localConf.setLong("raid.blockfix.filespertask", 2L);
    return localConf;
  }

  @Test
  public void testDirectoryFilterUnfixableFiles() throws Exception {
    conf = new Configuration();
    dfsCluster = new MiniDFSCluster(conf, NUM_DATANODES, true, null);
    dfsCluster.waitActive();
    FileSystem fs = dfsCluster.getFileSystem();
    Utils.loadTestCodecs(conf, 3, 5, 1, 3, "/destraid", "/destraidrs",
        false, true);
    try {
      Configuration testConf = fs.getConf();
      BlockIntegrityMonitor blockFixer =
          new LocalBlockIntegrityMonitor(testConf, false);

      String p1 = "/user/foo/f1";
      String p2 = "/user/foo/f2";
      String p3 = "/user1/foo/bar/f1";
      String p4 = "/a/b";
      String p5 = "/c";
      String p6 = "/destraidrs/user";
      String p7 = "/destraid/user1/foo";
      fs.mkdirs(new Path(p6));

      List<String> fileList = new ArrayList<String>();
      fileList.add(p1);
      fileList.add(p2);
      fileList.add(p3);
      fileList.add(p4);
      fileList.add(p5);

      blockFixer.filterUnreconstructableSourceFiles(fs, fileList.iterator());
      // p3 and p5 should be filtered out.
      assertEquals(3, fileList.size());

      Set<String> filtered = new HashSet<String>();
      for (String p: fileList) filtered.add(p);
      assertFalse("File not filtered", filtered.contains(p3));
      assertFalse("File not filtered", filtered.contains(p5));

      fileList.add(p3);
      fs.mkdirs(new Path(p7));
      blockFixer.filterUnreconstructableSourceFiles(fs, fileList.iterator());
      // Nothing is filtered.
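      // With the parity directory p7 now in place, p3 is no longer treated as
      // unreconstructable, so all four remaining paths (p1, p2, p4, p3) are
      // expected to survive this second pass.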
      assertEquals(4, fileList.size());
    } finally {
      dfsCluster.shutdown();
    }
  }

  @Test
  public void testDirBlockFixLocal() throws Exception {
    implDirBlockFix(true, true, false);
  }

  @Test
  public void testDirBlockFixDist() throws Exception {
    implDirBlockFix(false, true, false);
  }

  @Test
  public void testDirBlockFixDistWithoutStripeInfo() throws Exception {
    implDirBlockFix(false, false, false);
  }

  @Test
  public void testDirBlockFixDistWithStripeVerificationFailure()
      throws Exception {
    implDirBlockFix(false, true, true);
  }

  private long getTotal(int[] elements) {
    long totalCorruptBlocks = 0;
    for (int numBlk: elements) {
      totalCorruptBlocks += numBlk;
    }
    return totalCorruptBlocks;
  }

  /**
   * Create a file with three stripes, corrupt a block each in two stripes,
   * and wait for the file to be fixed.
   */
  private void implDirBlockFix(boolean local, boolean hasStripeInfo,
      boolean corruptStripe) throws Exception {
    LOG.info("Test testDirBlockFix started. local:" + local +
        " hasStripeInfo:" + hasStripeInfo + " corruptStripe:" + corruptStripe);
    int stripeLength = 3;
    mySetup(stripeLength);
    long[] crcs = new long[3];
    int[] seeds = new int[3];
    Path dirPath = new Path("/user/dhruba/raidtestrs");
    Path[] files = TestRaidDfs.createTestFiles(dirPath,
        fileSizes, blockSizes, crcs, seeds, fileSys, (short)1);
    Path destPath = new Path("/destraidrs/user/dhruba");
    LOG.info("Test testDirBlockFix created test files");
    Configuration localConf = this.getRaidNodeConfig(conf, local);
    // Do not allow multiple running jobs.
    localConf.setLong("raid.blockfix.maxpendingjobs", 1L);

    try {
      cnode = RaidNode.createRaidNode(null, localConf);
      TestRaidDfs.waitForDirRaided(LOG, fileSys, dirPath, destPath);
      cnode.stop(); cnode.join();

      DistributedFileSystem dfs = (DistributedFileSystem)fileSys;
      String[] corruptFiles = DFSUtil.getCorruptFiles(dfs);
      assertEquals("no corrupt files expected", 0, corruptFiles.length);
      assertEquals("filesFixed() should return 0 before fixing files",
                   0, cnode.blockIntegrityMonitor.getNumFilesFixed());
      if (!hasStripeInfo) {
        // clear out all stripes
        LocalStripeStore lss = new LocalStripeStore();
        lss.initialize(localConf, false, dfs);
        lss.clear();
      }
      if (corruptStripe) {
        LocalStripeStore lss = new LocalStripeStore();
        lss.initialize(localConf, false, dfs);
        Set<List<Block>> corruptCandidates =
            new HashSet<List<Block>>(lss.stripeSet.keySet());
        for (List<Block> lb : corruptCandidates) {
          for (Codec codec : Codec.getCodecs()) {
            StripeInfo si = lss.getStripe(codec, lb.get(0));
            if (si == null) {
              continue;
            }
            String oldSi = si.toString();
            Collections.rotate(si.parityBlocks, 1);
            Collections.rotate(si.srcBlocks, 1);
            lss.putStripe(codec, si.parityBlocks, si.srcBlocks);
            String newSi = lss.getStripe(codec, lb.get(0)).toString();
            LOG.info("Corrupt the stripe info old : " + oldSi +
                     " new : " + newSi);
          }
        }
      }
      this.corruptFiles(dirPath, crcs, rsCorruptFileIdx1, dfs, files,
          rsNumCorruptBlocksInFiles1);

      cnode = RaidNode.createRaidNode(null, localConf);
      long start = System.currentTimeMillis();
      while (cnode.blockIntegrityMonitor.getNumFilesFixed() < 3 &&
             cnode.blockIntegrityMonitor.getNumFileFixFailures() < 3 &&
             System.currentTimeMillis() - start < 120000) {
        LOG.info("Test testDirBlockFix waiting for files to be fixed.");
        Thread.sleep(1000);
      }
      long totalCorruptBlocks = getTotal(rsNumCorruptBlocksInFiles1);
      if (hasStripeInfo) {
        if (!corruptStripe) {
          TestBlockFixer.verifyMetrics(fileSys, cnode, local, 3L,
              totalCorruptBlocks);
          dfs = getDFS(conf, dfs);
          for (int i = 0; i < fileSizes.length; i++) {
            assertTrue("file " + files[i] + " not fixed",
                       TestRaidDfs.validateFile(dfs, files[i], fileSizes[i],
                                                crcs[i]));
          }
        } else {
          TestBlockFixer.verifyMetrics(fileSys, cnode, local, 0L, 0L);
          assertTrue("should fail to fix more than 3 files",
              cnode.blockIntegrityMonitor.getNumFileFixFailures() >= 3L);
          TestBlockFixer.verifyMetrics(fileSys, cnode,
              LOGTYPES.OFFLINE_RECONSTRUCTION_FILE, LOGRESULTS.FAILURE, 3L,
              true);
          // Will throw stripe mismatch exception for the first blocks of 3 files
          TestBlockFixer.verifyMetrics(fileSys, cnode,
              LOGTYPES.OFFLINE_RECONSTRUCTION_STRIPE_VERIFICATION,
              LOGRESULTS.FAILURE, 3L, true);
        }
      } else {
        TestBlockFixer.verifyMetrics(fileSys, cnode, local, 0L, 0L);
        assertTrue("should fail to fix more than 3 files",
            cnode.blockIntegrityMonitor.getNumFileFixFailures() >= 3L);
        TestBlockFixer.verifyMetrics(fileSys, cnode,
            LOGTYPES.OFFLINE_RECONSTRUCTION_GET_STRIPE, LOGRESULTS.FAILURE,
            totalCorruptBlocks, true);
        TestBlockFixer.verifyMetrics(fileSys, cnode,
            LOGTYPES.OFFLINE_RECONSTRUCTION_FILE, LOGRESULTS.FAILURE, 3L,
            true);
      }
    } catch (Exception e) {
      LOG.info("Test testDirBlockFix Exception " + e, e);
      throw e;
    } finally {
      myTearDown();
    }
    LOG.info("Test testDirBlockFix completed.");
  }

  /**
   * Tests integrity of generated block.
   * Create a file and delete a block entirely. Wait for the block to be
   * regenerated. Now stop RaidNode and corrupt the generated block.
   * Test that corruption in the generated block can be detected by clients.
   */
  private void generatedBlockTestCommon(String testName, int blockToCorrupt,
      boolean local) throws Exception {
    LOG.info("Test " + testName + " started.");
    int stripeLength = 3;
    mySetup(stripeLength);
    long[] crcs = new long[3];
    int[] seeds = new int[3];
    Path dirPath = new Path("/user/dhruba/raidtest");
    Path[] files = TestRaidDfs.createTestFiles(dirPath,
        fileSizes, blockSizes, crcs, seeds, fileSys, (short)1);
    Path destPath = new Path("/destraid/user/dhruba");
    LOG.info("Test " + testName + " created test files");
    Configuration localConf = this.getRaidNodeConfig(conf, local);

    try {
      cnode = RaidNode.createRaidNode(null, localConf);
      TestRaidDfs.waitForDirRaided(LOG, fileSys, dirPath, destPath);
      cnode.stop(); cnode.join();

      DistributedFileSystem dfs = (DistributedFileSystem)fileSys;
      String[] corruptFiles = DFSUtil.getCorruptFiles(dfs);
      assertEquals("no corrupt files expected", 0, corruptFiles.length);
      assertEquals("filesFixed() should return 0 before fixing files",
                   0, cnode.blockIntegrityMonitor.getNumFilesFixed());

      Integer[] corruptBlockIdxs = new Integer[]{blockToCorrupt};
      TestDirectoryRaidDfs.corruptBlocksInDirectory(conf, dirPath,
          crcs, corruptBlockIdxs, fileSys, dfsCluster, false, true);

      corruptFiles = DFSUtil.getCorruptFiles(dfs);
      assertEquals("files not corrupted", corruptBlockIdxs.length,
          corruptFiles.length);
      int corruptFileIdx = -1;
      for (int i = 0; i < files.length; i++) {
        if (files[i].toUri().getPath().equals(corruptFiles[0])) {
          corruptFileIdx = i;
          break;
        }
      }
      assertNotSame("Wrong corrupt file", -1, corruptFileIdx);

      cnode = RaidNode.createRaidNode(null, localConf);
      long start = System.currentTimeMillis();
      while (cnode.blockIntegrityMonitor.getNumFilesFixed() < 1 &&
             System.currentTimeMillis() - start < 120000) {
        LOG.info("Test testDirBlockFix waiting for files to be fixed.");
        Thread.sleep(1000);
      }
      TestBlockFixer.verifyMetrics(fileSys, cnode, local, 1L,
          corruptBlockIdxs.length);

      // Stop RaidNode
      cnode.stop(); cnode.join(); cnode = null;

      // The block has successfully been reconstructed.
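      // getDFS() below returns a fresh DistributedFileSystem client with the
      // FileSystem cache disabled and existing clients closed, so the
      // validation reads below do not reuse stale cached state.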
      dfs = getDFS(conf, dfs);
      assertTrue("file not fixed",
          TestRaidDfs.validateFile(dfs, files[corruptFileIdx],
              fileSizes[corruptFileIdx], crcs[corruptFileIdx]));

      // Now corrupt the generated block.
      TestDirectoryRaidDfs.corruptBlocksInDirectory(conf, dirPath,
          crcs, corruptBlockIdxs, dfs, dfsCluster, false, false);
      try {
        TestRaidDfs.validateFile(dfs, files[corruptFileIdx],
            fileSizes[corruptFileIdx], crcs[corruptFileIdx]);
        fail("Expected exception not thrown");
      } catch (org.apache.hadoop.fs.ChecksumException ce) {
      } catch (org.apache.hadoop.fs.BlockMissingException bme) {
      }
    } catch (Exception e) {
      LOG.info("Test " + testName + " Exception " + e, e);
      throw e;
    } finally {
      myTearDown();
    }
    LOG.info("Test " + testName + " completed.");
  }

  /**
   * Tests integrity of generated block.
   * Create a file and delete a block entirely. Wait for the block to be
   * regenerated. Now stop RaidNode and corrupt the generated block.
   * Test that corruption in the generated block can be detected by clients.
   */
  @Test
  public void testGeneratedBlockLocal() throws Exception {
    generatedBlockTestCommon("testGeneratedBlock", 2, true);
  }

  /**
   * Tests integrity of generated block.
   * Create a file and delete a block entirely. Wait for the block to be
   * regenerated. Now stop RaidNode and corrupt the generated block.
   * Test that corruption in the generated block can be detected by clients.
   */
  @Test
  public void testGeneratedBlockDist() throws Exception {
    generatedBlockTestCommon("testGeneratedBlock", 2, false);
  }

  /**
   * Tests integrity of generated last block.
   * Create a file and delete a block entirely. Wait for the block to be
   * regenerated. Now stop RaidNode and corrupt the generated block.
   * Test that corruption in the generated block can be detected by clients.
   */
  @Test
  public void testGeneratedLastBlockLocal() throws Exception {
    generatedBlockTestCommon("testGeneratedLastBlock", -1, true);
  }

  /**
   * Tests integrity of generated last block.
   * Create a file and delete a block entirely. Wait for the block to be
   * regenerated. Now stop RaidNode and corrupt the generated block.
   * Test that corruption in the generated block can be detected by clients.
   */
  @Test
  public void testGeneratedLastBlockDist() throws Exception {
    generatedBlockTestCommon("testGeneratedLastBlock", -1, false);
  }

  @Test
  public void testParityBlockFixLocal() throws Exception {
    implParityBlockFix("testParityBlockFixLocal", true);
  }

  @Test
  public void testParityBlockFixDist() throws Exception {
    implParityBlockFix("testParityBlockFixDist", false);
  }

  /**
   * Corrupt a parity file and wait for it to get fixed.
   */
  private void implParityBlockFix(String testName, boolean local)
      throws Exception {
    LOG.info("Test " + testName + " started.");
    int stripeLength = 3;
    mySetup(stripeLength);
    long[] crcs = new long[3];
    int[] seeds = new int[3];
    Path dirPath = new Path("/user/dhruba/raidtest");
    Path[] files = TestRaidDfs.createTestFiles(dirPath,
        fileSizes, blockSizes, crcs, seeds, fileSys, (short)1);
    Path destPath = new Path("/destraid/user/dhruba");
    Path parityFile = new Path("/destraid/user/dhruba/raidtest");
    LOG.info("Test " + testName + " created test files");
    Configuration localConf = this.getRaidNodeConfig(conf, local);

    try {
      cnode = RaidNode.createRaidNode(null, localConf);
      TestRaidDfs.waitForDirRaided(LOG, fileSys, dirPath, destPath);
      cnode.stop(); cnode.join();

      long parityCRC = RaidDFSUtil.getCRC(fileSys, parityFile);

      FileStatus parityStat = fileSys.getFileStatus(parityFile);
      DistributedFileSystem dfs = (DistributedFileSystem)fileSys;
      LocatedBlocks locs = RaidDFSUtil.getBlockLocations(
          dfs, parityFile.toUri().getPath(), 0, parityStat.getLen());
      String[] corruptFiles = DFSUtil.getCorruptFiles(dfs);

      assertEquals("no corrupt files expected", 0, corruptFiles.length);
      assertEquals("filesFixed() should return 0 before fixing files",
                   0, cnode.blockIntegrityMonitor.getNumFilesFixed());

      // Corrupt parity blocks for different stripes.
      int[] corruptBlockIdxs = new int[]{0, 1, 2};
      for (int idx: corruptBlockIdxs)
        corruptBlock(locs.get(idx).getBlock(), dfsCluster);
      RaidDFSUtil.reportCorruptBlocks(dfs, parityFile, corruptBlockIdxs,
          2*blockSize);

      corruptFiles = DFSUtil.getCorruptFiles(dfs);
      assertEquals("file not corrupted", 1, corruptFiles.length);
      assertEquals("wrong file corrupted",
                   corruptFiles[0], parityFile.toUri().getPath());

      cnode = RaidNode.createRaidNode(null, localConf);
      long start = System.currentTimeMillis();
      while (cnode.blockIntegrityMonitor.getNumFilesFixed() < 1 &&
             System.currentTimeMillis() - start < 120000) {
        LOG.info("Test " + testName + " waiting for files to be fixed.");
        Thread.sleep(3000);
      }
      TestBlockFixer.verifyMetrics(fileSys, cnode, local, 1L,
          corruptBlockIdxs.length);

      long checkCRC = RaidDFSUtil.getCRC(fileSys, parityFile);

      assertEquals("file not fixed", parityCRC, checkCRC);
    } catch (Exception e) {
      LOG.info("Test " + testName + " Exception " + e +
               StringUtils.stringifyException(e));
      throw e;
    } finally {
      myTearDown();
    }
    LOG.info("Test " + testName + " completed.");
  }

  private void corruptFiles(Path dirPath, long[] crcs,
      Integer[] corruptBlockIdxs, DistributedFileSystem dfs,
      Path[] files, int[] numCorruptBlocksInFiles) throws IOException {
    int totalCorruptFiles = DFSUtil.getCorruptFiles(dfs).length;
    TestDirectoryRaidDfs.corruptBlocksInDirectory(conf, dirPath,
        crcs, corruptBlockIdxs, fileSys, dfsCluster, false, true);
    String[] corruptFiles = DFSUtil.getCorruptFiles(dfs);
    for (int i = 0; i < numCorruptBlocksInFiles.length; i++) {
      if (numCorruptBlocksInFiles[i] > 0)
        totalCorruptFiles++;
    }
    assertEquals("files not corrupted", totalCorruptFiles,
        corruptFiles.length);
    for (int i = 0; i < fileSizes.length; i++) {
      assertEquals("wrong number of corrupt blocks for file " + files[i],
          numCorruptBlocksInFiles[i],
          RaidDFSUtil.corruptBlocksInFile(dfs, files[i].toUri().getPath(), 0,
              fileSizes[i]).size());
    }
  }

  /**
   * tests that we can have 2 concurrent jobs fixing files
   * (dist block fixer)
   */
  @Test
  public void testConcurrentJobs() throws Exception {
    LOG.info("Test testConcurrentJobs started.");
    int stripeLength = 3;
    mySetup(stripeLength);
    long[] crcs1 = new long[3];
    int[] seeds1 = new int[3];
    long[] crcs2 =
        new long[3];
    int[] seeds2 = new int[3];
    Path dirPath1 = new Path("/user/dhruba/raidtestrs/1");
    Path[] files1 = TestRaidDfs.createTestFiles(dirPath1,
        fileSizes, blockSizes, crcs1, seeds1, fileSys, (short)1);
    Path dirPath2 = new Path("/user/dhruba/raidtestrs/2");
    Path[] files2 = TestRaidDfs.createTestFiles(dirPath2,
        fileSizes, blockSizes, crcs2, seeds2, fileSys, (short)1);
    Path destPath = new Path("/destraidrs/user/dhruba/raidtestrs");
    LOG.info("Test testConcurrentJobs created test files");
    Configuration localConf = this.getRaidNodeConfig(conf, false);
    localConf.setLong(BlockIntegrityMonitor.BLOCKCHECK_INTERVAL, 15000L);
    localConf.setLong(
        DistBlockIntegrityMonitor.RAIDNODE_BLOCK_FIX_SUBMISSION_INTERVAL_KEY,
        15000L);
    localConf.setLong(
        DistBlockIntegrityMonitor.RAIDNODE_BLOCK_FIX_SCAN_SUBMISSION_INTERVAL_KEY,
        3600000);

    try {
      cnode = RaidNode.createRaidNode(null, localConf);
      TestRaidDfs.waitForDirRaided(LOG, fileSys, dirPath1, destPath);
      TestRaidDfs.waitForDirRaided(LOG, fileSys, dirPath2, destPath);
      cnode.stop(); cnode.join();

      DistributedFileSystem dfs = (DistributedFileSystem)fileSys;
      String[] corruptFiles = DFSUtil.getCorruptFiles(dfs);
      assertEquals("no corrupt files expected", 0, corruptFiles.length);
      assertEquals("filesFixed() should return 0 before fixing files",
                   0, cnode.blockIntegrityMonitor.getNumFilesFixed());

      // corrupt directory 1
      this.corruptFiles(dirPath1, crcs1, rsCorruptFileIdx1, dfs, files1,
          rsNumCorruptBlocksInFiles1);

      cnode = RaidNode.createRaidNode(null, localConf);
      DistBlockIntegrityMonitor blockFixer =
          (DistBlockIntegrityMonitor) cnode.blockIntegrityMonitor;
      long start = System.currentTimeMillis();

      // All files are HIGH-PRI corrupt files
      while (blockFixer.jobsRunning() < 1 &&
             System.currentTimeMillis() - start < 60000) {
        LOG.info("Test testDirBlockFix waiting for fixing job 1 to start");
        Thread.sleep(1000);
      }
      assertEquals("job 1 not running", 1, blockFixer.jobsRunning());

      // Corrupt directory 2
      this.corruptFiles(dirPath2, crcs2, rsCorruptFileIdx2, dfs, files2,
          rsNumCorruptBlocksInFiles2);

      // 1 LOW-PRI file and 2 HIGH-PRI files
      while (blockFixer.jobsRunning() < 3 &&
             System.currentTimeMillis() - start < 60000) {
        LOG.info("Test testDirBlockFix waiting for fixing job 2 and 3 to start");
        Thread.sleep(1000);
      }
      assertTrue("more than 3 jobs are running",
          blockFixer.jobsRunning() >= 3);

      while (blockFixer.getNumFilesFixed() < 6 &&
             System.currentTimeMillis() - start < 240000) {
        LOG.info("Test testDirBlockFix waiting for files to be fixed.");
        Thread.sleep(1000);
      }
      TestBlockFixer.verifyMetrics(fileSys, cnode, false, 6L,
          getTotal(rsNumCorruptBlocksInFiles1) +
          getTotal(rsNumCorruptBlocksInFiles2));
      dfs = getDFS(conf, dfs);
      for (int i = 0; i < fileSizes.length; i++) {
        assertTrue("file " + files1[i] + " not fixed",
            TestRaidDfs.validateFile(dfs, files1[i], fileSizes[i], crcs1[i]));
      }
      for (int i = 0; i < fileSizes.length; i++) {
        assertTrue("file " + files2[i] + " not fixed",
            TestRaidDfs.validateFile(dfs, files2[i], fileSizes[i], crcs2[i]));
      }
    } catch (Exception e) {
      LOG.info("Test testConcurrentJobs exception " + e, e);
      throw e;
    } finally {
      myTearDown();
    }
  }

  /**
   * tests that the distributed block fixer obeys
   * the limit on how many jobs to submit simultaneously.
   */
  @Test
  public void testMaxPendingJobs() throws Exception {
    LOG.info("Test testMaxPendingJobs started.");
    int stripeLength = 3;
    mySetup(stripeLength);
    long[] crcs1 = new long[3];
    int[] seeds1 = new int[3];
    long[] crcs2 = new long[3];
    int[] seeds2 = new int[3];
    Path dirPath1 = new Path("/user/dhruba/raidtestrs/1");
    Path[] files1 = TestRaidDfs.createTestFiles(dirPath1,
        fileSizes, blockSizes, crcs1, seeds1, fileSys, (short)1);
    Path dirPath2 = new Path("/user/dhruba/raidtestrs/2");
    Path[] files2 = TestRaidDfs.createTestFiles(dirPath2,
        fileSizes, blockSizes, crcs2, seeds2, fileSys, (short)1);
    Path destPath = new Path("/destraidrs/user/dhruba/raidtestrs");
    LOG.info("Test testMaxPendingJobs created test files");
    Configuration localConf = this.getRaidNodeConfig(conf, false);
    localConf.setLong("raid.blockfix.maxpendingjobs", 1L);

    try {
      cnode = RaidNode.createRaidNode(null, localConf);
      TestRaidDfs.waitForDirRaided(LOG, fileSys, dirPath1, destPath);
      TestRaidDfs.waitForDirRaided(LOG, fileSys, dirPath2, destPath);
      cnode.stop(); cnode.join();

      DistributedFileSystem dfs = (DistributedFileSystem)fileSys;
      String[] corruptFiles = DFSUtil.getCorruptFiles(dfs);
      assertEquals("no corrupt files expected", 0, corruptFiles.length);
      assertEquals("filesFixed() should return 0 before fixing files",
                   0, cnode.blockIntegrityMonitor.getNumFilesFixed());

      // corrupt directory 1
      this.corruptFiles(dirPath1, crcs1, rsCorruptFileIdx1, dfs, files1,
          rsNumCorruptBlocksInFiles1);

      cnode = RaidNode.createRaidNode(null, localConf);
      DistBlockIntegrityMonitor blockFixer =
          (DistBlockIntegrityMonitor) cnode.blockIntegrityMonitor;
      long start = System.currentTimeMillis();

      while (blockFixer.jobsRunning() < 1 &&
             System.currentTimeMillis() - start < 60000) {
        LOG.info("Test testDirBlockFix waiting for fixing job 1 to start");
        Thread.sleep(1000);
      }
      assertEquals("job not running", 1, blockFixer.jobsRunning());

      // corrupt directory 2
      this.corruptFiles(dirPath2, crcs2, rsCorruptFileIdx2, dfs, files2,
          rsNumCorruptBlocksInFiles2);

      // wait until both files are fixed
      while (blockFixer.getNumFilesFixed() < 6 &&
             System.currentTimeMillis() - start < 240000) {
        // make sure the block fixer does not start a second job while
        // the first one is still running
        assertTrue("too many jobs running", blockFixer.jobsRunning() <= 1);
        Thread.sleep(1000);
      }
      TestBlockFixer.verifyMetrics(fileSys, cnode, false, 6L,
          getTotal(rsNumCorruptBlocksInFiles1) +
          getTotal(rsNumCorruptBlocksInFiles2));
      dfs = getDFS(conf, dfs);
      for (int i = 0; i < fileSizes.length; i++) {
        assertTrue("file " + files1[i] + " not fixed",
            TestRaidDfs.validateFile(dfs, files1[i], fileSizes[i], crcs1[i]));
      }
      for (int i = 0; i < fileSizes.length; i++) {
        assertTrue("file " + files2[i] + " not fixed",
            TestRaidDfs.validateFile(dfs, files2[i], fileSizes[i], crcs2[i]));
      }
    } catch (Exception e) {
      LOG.info("Test testMaxPendingJobs exception " + e +
               StringUtils.stringifyException(e));
      throw e;
    } finally {
      myTearDown();
    }
  }

  static class FakeDistBlockIntegrityMonitor extends DistBlockIntegrityMonitor {
    Map<String, List<String>> submittedJobs =
        new HashMap<String, List<String>>();

    FakeDistBlockIntegrityMonitor(Configuration conf) throws Exception {
      super(conf);
    }

    @Override
    void submitJob(Job job, List<String> filesInJob, Priority priority,
        Map<Job, List<LostFileInfo>> jobIndex,
        Map<JobID, TrackingUrlInfo> idToTrackingUrlMap) {
      LOG.info("Job " + job.getJobName() + " was submitted ");
      submittedJobs.put(job.getJobName(), filesInJob);
    }
  }

  public void testMultiplePriorities() throws Exception {
    long[] crcs = new long[3];
    int[] seeds = new int[3];
    Path dirPath = new Path("/home/test");
    int stripeLength = 3;
    short repl = 1;
    mySetup(stripeLength);
    Codec codec = Codec.getCodec("rs");
    LOG.info("Starting testMultiplePriorities");
    try {
      // Create test file and raid it.
      Path[] files = TestRaidDfs.createTestFiles(dirPath,
          fileSizes, blockSizes, crcs, seeds, fileSys, (short)1);
      FileStatus stat = fileSys.getFileStatus(dirPath);
      RaidNode.doRaid(conf, stat, new Path(codec.parityDirectory), codec,
          new RaidNode.Statistics(), RaidUtils.NULL_PROGRESSABLE,
          false, repl, repl);

      Integer[] corruptBlockIdxs = new Integer[]{0, 2};
      LOG.info("Corrupt block " + corruptBlockIdxs + " of directory " + dirPath);
      TestDirectoryRaidDfs.corruptBlocksInDirectory(conf, dirPath,
          crcs, corruptBlockIdxs, fileSys, dfsCluster, false, true);

      // Create Block Fixer and fix.
      FakeDistBlockIntegrityMonitor distBlockFixer =
          new FakeDistBlockIntegrityMonitor(conf);
      assertEquals(0, distBlockFixer.submittedJobs.size());

      // waiting for one job to submit
      long startTime = System.currentTimeMillis();
      while (System.currentTimeMillis() - startTime < 120000 &&
             distBlockFixer.submittedJobs.size() == 0) {
        distBlockFixer.getCorruptionMonitor().checkAndReconstructBlocks();
        LOG.info("Waiting for jobs to submit");
        Thread.sleep(10000);
      }
      int submittedJob = distBlockFixer.submittedJobs.size();
      LOG.info("Already Submitted " + submittedJob + " jobs");
      assertTrue("Should submit more than 1 jobs", submittedJob >= 1);

      // Corrupt two more blocks
      corruptBlockIdxs = new Integer[]{4, 5};
      LOG.info("Corrupt block " + corruptBlockIdxs + " of directory " + dirPath);
      TestDirectoryRaidDfs.corruptBlocksInDirectory(conf, dirPath,
          crcs, corruptBlockIdxs, fileSys, dfsCluster, false, true);

      // A new job should be submitted since two blocks are corrupt.
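      // Blocks 4 and 5 fall in a different stripe of the directory than the
      // earlier corruptions (stripe length is 3), so the monitor is expected
      // to submit at least one additional reconstruction job for them.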
      startTime = System.currentTimeMillis();
      while (System.currentTimeMillis() - startTime < 120000 &&
             distBlockFixer.submittedJobs.size() == submittedJob) {
        distBlockFixer.getCorruptionMonitor().checkAndReconstructBlocks();
        LOG.info("Waiting for more jobs to submit");
        Thread.sleep(10000);
      }
      LOG.info("Already Submitted " + distBlockFixer.submittedJobs.size() +
               " jobs");
      assertTrue("should submit more than 1 jobs",
          distBlockFixer.submittedJobs.size() - submittedJob >= 1);
    } finally {
      myTearDown();
    }
  }

  public static DistributedFileSystem getDFS(
      Configuration conf, FileSystem dfs) throws IOException {
    Configuration clientConf = new Configuration(conf);
    clientConf.set("fs.hdfs.impl",
        "org.apache.hadoop.hdfs.DistributedFileSystem");
    clientConf.setBoolean("fs.hdfs.impl.disable.cache", true);
    URI dfsUri = dfs.getUri();
    FileSystem.closeAll();
    return (DistributedFileSystem) FileSystem.get(dfsUri, clientConf);
  }

  private void mySetup(int stripeLength) throws Exception {
    if (System.getProperty("hadoop.log.dir") == null) {
      String base = new File(".").getAbsolutePath();
      System.setProperty("hadoop.log.dir", new Path(base).toString() + "/logs");
    }

    new File(TEST_DIR).mkdirs(); // Make sure data directory exists
    conf = new Configuration();

    conf.set("raid.config.file", CONFIG_FILE);
    conf.setBoolean("raid.config.reload", true);
    conf.setLong("raid.config.reload.interval", RELOAD_INTERVAL);

    // scan all policies once every 5 seconds
    conf.setLong("raid.policy.rescan.interval", 5000);

    // do not use map-reduce cluster for Raiding
    conf.set("raid.classname", "org.apache.hadoop.raid.LocalRaidNode");
    conf.set("raid.server.address", "localhost:" + MiniDFSCluster.getFreePort());
    conf.set("mapred.raid.http.address", "localhost:0");

    Utils.loadTestCodecs(conf, stripeLength, stripeLength, 1, 3,
        "/destraid", "/destraidrs", false, true);

    conf.setBoolean("dfs.permissions", false);
    // Make sure initial repl is smaller than NUM_DATANODES
    conf.setInt(RaidNode.RAID_PARITY_INITIAL_REPL_KEY, 1);

    dfsCluster = new MiniDFSCluster(conf, NUM_DATANODES, true, null);
    dfsCluster.waitActive();
    fileSys = dfsCluster.getFileSystem();
    namenode = fileSys.getUri().toString();

    FileSystem.setDefaultUri(conf, namenode);
    mr = new MiniMRCluster(4, namenode, 3);
    jobTrackerName = "localhost:" + mr.getJobTrackerPort();
    hftp = "hftp://localhost.localdomain:" + dfsCluster.getNameNodePort();

    FileSystem.setDefaultUri(conf, namenode);
    conf.set("mapred.job.tracker", jobTrackerName);
    conf.set(RaidNode.RAID_CHECKSUM_STORE_CLASS_KEY,
        "org.apache.hadoop.raid.LocalChecksumStore");
    conf.setBoolean(RaidNode.RAID_CHECKSUM_STORE_REQUIRED_KEY, true);
    conf.set(LocalChecksumStore.LOCAL_CHECK_STORE_DIR_KEY, CHECKSUM_STORE_DIR);
    conf.set(RaidNode.RAID_STRIPE_STORE_CLASS_KEY,
        "org.apache.hadoop.raid.LocalStripeStore");
    conf.set(LocalStripeStore.LOCAL_STRIPE_STORE_DIR_KEY, STRIPE_STORE_DIR);

    ConfigBuilder cb = new ConfigBuilder(CONFIG_FILE);
    cb.addPolicy("RaidTest1", "/user/dhruba/raidtest", 1, 1);
    cb.addPolicy("RaidTest2", "/user/dhruba/raidtestrs", 1, 1, "rs");
    cb.persist();
  }

  private void myTearDown() throws Exception {
    if (cnode != null) { cnode.stop(); cnode.join(); }
    if (mr != null) { mr.shutdown(); }
    if (dfsCluster != null) { dfsCluster.shutdown(); }
  }

  static void corruptBlock(Block block, MiniDFSCluster dfs) throws IOException {
    boolean corrupted = false;
    for (int i = 0; i < NUM_DATANODES; i++) {
      corrupted |= TestDatanodeBlockScanner.corruptReplica(block, i, dfs);
    }
    assertTrue("could not corrupt block", corrupted);
  }
}