/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.raid;

import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.RaidDFSUtil;
import org.apache.hadoop.hdfs.TestRaidDfs;
import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.raid.DistBlockIntegrityMonitor.CorruptFileStatus;
import org.apache.hadoop.util.StringUtils;
import org.junit.Test;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Random;

import junit.framework.TestCase;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.TestDatanodeBlockScanner;
import org.apache.hadoop.mapred.MiniMRCluster;

/**
 * Tests the corrupt file counter maintained by the RaidNode.
 */
public class TestFileCorruptions extends TestCase {
  final static Log LOG = LogFactory.getLog(
      "org.apache.hadoop.raid.TestFileCorruptions");
  final static String TEST_DIR = new File(System.getProperty("test.build.data",
      "build/contrib/raid/test/data")).getAbsolutePath();
  final static String CONFIG_FILE = new File(TEST_DIR,
      "test-raid.xml").getAbsolutePath();
  final static long RELOAD_INTERVAL = 1000;
  final static int NUM_DATANODES = 3;

  Configuration conf;
  String namenode = null;
  MiniDFSCluster dfsCluster = null;
  String hftp = null;
  MiniMRCluster mr = null;
  FileSystem fileSys = null;
  RaidNode cnode = null;
  String jobTrackerName = null;
  Random rand = new Random();

  static {
    ParityFilePair.disableCacheUsedInTestOnly();
  }

  /**
   * Tests that the corrupt file counter reports the expected number of
   * RAID-unrecoverable files for each directory that the RaidNode monitors.
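   * The test RAIDs two files, corrupts several blocks in each of them in
   * turn, and then polls RaidNode.getCorruptFilesCounterMap() until the
   * counter for the monitored directory reflects the corruption.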
   */
  @Test
  public void testCorruptFileCounter() throws Exception {
    LOG.info("Test testCorruptFileCounter started.");
    long blockSize = 8192L;
    int stripeLength = 3;
    mySetup(stripeLength, -1); // never har
    Path file1 = new Path("/user/dhruba/raidtest/file1");
    Path file2 = new Path("/user/dhruba/raidtest/file2");
    Path destPath = new Path("/destraid/user/dhruba/raidtest");
    long crc1 = TestRaidDfs.createTestFilePartialLastBlock(fileSys, file1,
        1, 20, blockSize);
    long crc2 = TestRaidDfs.createTestFilePartialLastBlock(fileSys, file2,
        1, 20, blockSize);
    LOG.info("Test testCorruptFileCounter created test files");

    // create an instance of the RaidNode
    Configuration localConf = new Configuration(conf);
    localConf.setInt("raid.blockfix.interval", 1000);
    localConf.set("raid.blockfix.classname",
        "org.apache.hadoop.raid.DistBlockIntegrityMonitor");
    localConf.setLong("raid.blockfix.filespertask", 2L);
    localConf.setLong("raid.blockfix.maxpendingjobs", 1L);
    localConf.set("raid.corruptfile.counter.dirs",
        "/user/dhruba/raidtest,/user/dhruba1");
    localConf.setInt("raid.corruptfilecount.interval", 1000);
    localConf.set("mapred.raid.http.address", "localhost:0");
    localConf.setInt(
        DistBlockIntegrityMonitor.RAIDNODE_MAX_NUM_DETECTION_TIME_COLLECTED_KEY,
        1);

    try {
      cnode = RaidNode.createRaidNode(null, localConf);
      TestRaidDfs.waitForFileRaided(LOG, fileSys, file1, destPath);
      TestRaidDfs.waitForFileRaided(LOG, fileSys, file2, destPath);
      cnode.stop();
      cnode.join();

      FileStatus file1Stat = fileSys.getFileStatus(file1);
      FileStatus file2Stat = fileSys.getFileStatus(file2);
      DistributedFileSystem dfs = (DistributedFileSystem) fileSys;
      LocatedBlocks file1Loc = RaidDFSUtil.getBlockLocations(
          dfs, file1.toUri().getPath(), 0, file1Stat.getLen());
      LocatedBlocks file2Loc = RaidDFSUtil.getBlockLocations(
          dfs, file2.toUri().getPath(), 0, file2Stat.getLen());

      String[] corruptFiles = DFSUtil.getCorruptFiles(dfs);
      assertEquals("no corrupt files expected", 0, corruptFiles.length);
      assertEquals("filesFixed() should return 0 before fixing files",
          0, cnode.blockIntegrityMonitor.getNumFilesFixed());

      // corrupt file1
      int[] corruptBlockIdxs = new int[]{0, 1, 2, 3, 4, 6};
      for (int idx : corruptBlockIdxs) {
        corruptBlock(file1Loc.get(idx).getBlock(), dfsCluster);
      }
      RaidDFSUtil.reportCorruptBlocks(dfs, file1, corruptBlockIdxs, blockSize);

      // restart the RaidNode and wait for the counter to pick up file1
      cnode = RaidNode.createRaidNode(null, localConf);
      long startTime = System.currentTimeMillis();
      Map<String, Map<CorruptFileStatus, Long>> result = null;
      while (System.currentTimeMillis() - startTime < 120000) {
        result = cnode.getCorruptFilesCounterMap();
        Long counter = result.get("/user/dhruba/raidtest").get(
            CorruptFileStatus.RAID_UNRECOVERABLE);
        if (counter != null && counter > 0) {
          break;
        }
        LOG.info("Waiting for 1 corrupt file");
        Thread.sleep(1000);
      }
      assertEquals("We expect 1 corrupt file", result.get(
          "/user/dhruba/raidtest").get(CorruptFileStatus.RAID_UNRECOVERABLE),
          new Long(1L));
      assertEquals("We expect 0 corrupt files", result.get(
          "/user/dhruba1").get(CorruptFileStatus.RAID_UNRECOVERABLE),
          new Long(0L));

      // corrupt file2
      for (int idx : corruptBlockIdxs) {
        corruptBlock(file2Loc.get(idx).getBlock(), dfsCluster);
      }
      RaidDFSUtil.reportCorruptBlocks(dfs, file2, corruptBlockIdxs, blockSize);
      startTime = System.currentTimeMillis();
      while (System.currentTimeMillis() - startTime < 120000) {
        result = cnode.getCorruptFilesCounterMap();
        Long counter = result.get("/user/dhruba/raidtest").get(
            CorruptFileStatus.RAID_UNRECOVERABLE);
        if (counter != null && counter > 1) {
          break;
        }
        LOG.info("Waiting for 2 corrupt files");
        Thread.sleep(1000);
      }
      LOG.info("Handle " + cnode.getNumDetectionsPerSec() +
          " files per second");
      assertTrue(cnode.getNumDetectionsPerSec() > 0);
      assertEquals("We expect 2 corrupt files", result.get(
          "/user/dhruba/raidtest").get(CorruptFileStatus.RAID_UNRECOVERABLE),
          new Long(2L));
      assertEquals("We expect 0 corrupt files", result.get(
          "/user/dhruba1").get(CorruptFileStatus.RAID_UNRECOVERABLE),
          new Long(0L));
    } catch (Exception e) {
      LOG.info("Test testCorruptFileCounter exception " + e.getMessage(), e);
      throw e;
    } finally {
      myTearDown();
    }
  }

  private void mySetup(int stripeLength, int timeBeforeHar) throws Exception {
    new File(TEST_DIR).mkdirs(); // Make sure data directory exists
    conf = new Configuration();

    conf.set("raid.config.file", CONFIG_FILE);
    conf.setBoolean("raid.config.reload", true);
    conf.setLong("raid.config.reload.interval", RELOAD_INTERVAL);

    // scan all policies once every 5 seconds
    conf.setLong("raid.policy.rescan.interval", 5000);

    // do not use map-reduce cluster for Raiding
    conf.set("raid.classname", "org.apache.hadoop.raid.LocalRaidNode");
    conf.set("raid.server.address", "localhost:0");
    conf.set("mapred.raid.http.address", "localhost:0");
    conf.setInt(BlockIntegrityMonitor.BLOCKCHECK_INTERVAL, 3000);

    conf.setBoolean("dfs.permissions", false);

    Utils.loadTestCodecs(conf, 5, 1, 3, "/destraid", "/destraidrs");

    dfsCluster = new MiniDFSCluster(conf, NUM_DATANODES, true, null);
    dfsCluster.waitActive();
    fileSys = dfsCluster.getFileSystem();
    namenode = fileSys.getUri().toString();

    FileSystem.setDefaultUri(conf, namenode);
    mr = new MiniMRCluster(4, namenode, 3);
    jobTrackerName = "localhost:" + mr.getJobTrackerPort();
    hftp = "hftp://localhost.localdomain:" + dfsCluster.getNameNodePort();

    FileSystem.setDefaultUri(conf, namenode);
    conf.set("mapred.job.tracker", jobTrackerName);

    // write the raid policy file read by the RaidNode
    FileWriter fileWriter = new FileWriter(CONFIG_FILE);
    fileWriter.write("<?xml version=\"1.0\"?>\n");
    String str =
        "<configuration> " +
          "<policy name = \"RaidTest1\"> " +
            "<srcPath prefix=\"/user/dhruba/raidtest\"/> " +
            "<codecId>xor</codecId> " +
            "<destPath> /destraid</destPath> " +
            "<property> " +
              "<name>targetReplication</name> " +
              "<value>1</value> " +
              "<description>after RAIDing, decrease the replication factor of a file to this value." +
              "</description> " +
            "</property> " +
            "<property> " +
              "<name>metaReplication</name> " +
              "<value>1</value> " +
              "<description> replication factor of parity file" +
              "</description> " +
            "</property> " +
            "<property> " +
              "<name>modTimePeriod</name> " +
              "<value>2000</value> " +
              "<description> time (milliseconds) after a file is modified to make it " +
                "a candidate for RAIDing " +
              "</description> " +
            "</property> ";
    if (timeBeforeHar >= 0) {
      str +=
            "<property> " +
              "<name>time_before_har</name> " +
              "<value>" + timeBeforeHar + "</value> " +
              "<description> amount of time waited before har'ing parity files" +
              "</description> " +
            "</property> ";
    }
    str +=
          "</policy>" +
        "</configuration>";
    fileWriter.write(str);
    fileWriter.close();
  }

  private void myTearDown() throws Exception {
    if (cnode != null) {
      cnode.stop();
      cnode.join();
    }
    if (mr != null) {
      mr.shutdown();
    }
    if (dfsCluster != null) {
      dfsCluster.shutdown();
    }
  }

  /**
   * Corrupts every replica of the given block across the datanodes and
   * fails the test if no replica could be corrupted.
   */
  static void corruptBlock(Block block, MiniDFSCluster dfs) throws IOException {
    boolean corrupted = false;
    for (int i = 0; i < NUM_DATANODES; i++) {
      corrupted |= TestDatanodeBlockScanner.corruptReplica(block, i, dfs);
    }
    assertTrue("could not corrupt block", corrupted);
  }
}