/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.raid;
import java.io.File;
import java.util.Random;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.RaidDFSUtil;
import org.apache.hadoop.hdfs.TestRaidDfs;
import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
import org.apache.hadoop.mapred.MiniMRCluster;
import org.apache.hadoop.raid.LogUtils.LOGRESULTS;
import org.apache.hadoop.raid.LogUtils.LOGTYPES;
import junit.framework.TestCase;
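/**
 * Tests offline block reconstruction with a simulated (dry-run) decode pass.
 * Each scenario runs with either a correct fixer or the deliberately broken
 * Bad* XOR/Reed-Solomon codes, with and without a checksum store, and checks
 * the fixed/failed counters and reconstruction log metrics.
 */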
public class TestSimulationBlockFixer extends TestCase {
final static Log LOG = LogFactory.getLog(
"org.apache.hadoop.raid.TestSimulationBlockFixer");
final static String TEST_DIR = new File(System.getProperty("test.build.data",
"build/contrib/raid/test/data")).getAbsolutePath();
final static String CONFIG_FILE = new File(TEST_DIR,
"test-raid.xml").getAbsolutePath();
final static long RELOAD_INTERVAL = 1000;
final static int NUM_DATANODES = 3;
Configuration conf;
String namenode = null;
MiniDFSCluster dfsCluster = null;
String hftp = null;
MiniMRCluster mr = null;
FileSystem fileSys = null;
RaidNode cnode = null;
String jobTrackerName = null;
Random rand = new Random();
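// Disable the ParityFilePair cache (a test-only hook) so every parity lookup
// goes to the filesystem rather than a possibly stale cached entry.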
static {
ParityFilePair.disableCacheUsedInTestOnly();
}
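/**
 * Brings up a MiniDFSCluster and MiniMRCluster and writes a test-raid.xml
 * policy for /user/dhruba/raidtest. The xorCode/rsCode class names let a
 * test swap in the deliberately broken Bad* codecs; timeBeforeHar is
 * currently unused in this setup.
 */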
protected void mySetup(int stripeLength, int timeBeforeHar,
String xorCode, String rsCode, String code, boolean hasSimulation)
throws Exception {
if (System.getProperty("hadoop.log.dir") == null) {
String base = new File(".").getAbsolutePath();
System.setProperty("hadoop.log.dir", new Path(base).toString() + "/logs");
}
new File(TEST_DIR).mkdirs(); // Make sure data directory exists
conf = new Configuration();
conf.set("raid.config.file", CONFIG_FILE);
conf.setBoolean("raid.config.reload", true);
conf.setLong("raid.config.reload.interval", RELOAD_INTERVAL);
// Scan all policies once every 5 seconds.
conf.setLong("raid.policy.rescan.interval", 5000);
conf.set("mapred.raid.http.address", "localhost:0");
conf.setInt(RaidNode.RAID_PARITY_INITIAL_REPL_KEY, 1);
// do not use map-reduce cluster for Raiding
conf.set("raid.classname", "org.apache.hadoop.raid.LocalRaidNode");
conf.set("raid.server.address", "localhost:" + MiniDFSCluster.getFreePort());
conf.setLong("raid.blockfix.maxpendingjobs", 1L);
conf.setLong(BlockIntegrityMonitor.BLOCKCHECK_INTERVAL, 1000L);
conf.setLong(DistBlockIntegrityMonitor.RAIDNODE_BLOCK_FIX_SUBMISSION_INTERVAL_KEY,
15000L);
conf.setLong(DistBlockIntegrityMonitor.RAIDNODE_BLOCK_FIX_SCAN_SUBMISSION_INTERVAL_KEY,
3600000);
Utils.loadTestCodecs(conf, stripeLength, stripeLength, 1, 3, "/destraid",
"/destraidrs", hasSimulation, xorCode, rsCode,
false);
conf.setBoolean("dfs.permissions", false);
dfsCluster = new MiniDFSCluster(conf, NUM_DATANODES, true, null);
dfsCluster.waitActive();
fileSys = dfsCluster.getFileSystem();
namenode = fileSys.getUri().toString();
FileSystem.setDefaultUri(conf, namenode);
mr = new MiniMRCluster(4, namenode, 3);
jobTrackerName = "localhost:" + mr.getJobTrackerPort();
hftp = "hftp://localhost.localdomain:" + dfsCluster.getNameNodePort();
conf.set("mapred.job.tracker", jobTrackerName);
ConfigBuilder cb = new ConfigBuilder(CONFIG_FILE);
cb.addPolicy("RaidTest1", "/user/dhruba/raidtest", 1, 1, code);
cb.persist();
}
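/** Stops the RaidNode and shuts down the MR and DFS mini-clusters. */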
protected void myTearDown() throws Exception {
if (cnode != null) { cnode.stop(); cnode.join(); }
if (mr != null) { mr.shutdown(); }
if (dfsCluster != null) { dfsCluster.shutdown(); }
}
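// The implSimulationBlockFixer flags below are, in order:
// (isGoodFixer, code, fixSource, hasChecksum, hasSimulation).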
public void testBadBlockFixerWithoutChecksums() throws Exception {
implSimulationBlockFixer(false, "xor", true, false, true);
implSimulationBlockFixer(false, "rs", true, false, true);
}
public void testBadBlockFixerWithoutSimulation() throws Exception {
implSimulationBlockFixer(false, "xor", true, true, false);
implSimulationBlockFixer(false, "rs", true, true, false);
}
public void testGoodBlockFixer() throws Exception {
implSimulationBlockFixer(true, "xor", true);
implSimulationBlockFixer(true, "rs", true);
}
public void testBadBlockFixer() throws Exception {
implSimulationBlockFixer(false, "xor", true);
implSimulationBlockFixer(false, "rs", true);
}
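/** Convenience overload: checksums available and simulation enabled. */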
public void implSimulationBlockFixer(boolean isGoodFixer, String code,
boolean fixSource) throws Exception {
implSimulationBlockFixer(isGoodFixer, code, fixSource, true, true);
}
public void implSimulationBlockFixer(boolean isGoodFixer, String code,
boolean fixSource, boolean hasChecksum, boolean hasSimulation)
throws Exception {
String testMessage = "TestSimulationBlockFixer started: " +
(isGoodFixer ? "Good" : "Bad") + " fixer, code " + code +
", fixing " + (fixSource ? "source" : "parity") +
(hasChecksum ? ", has-checksum" : ", no-checksum") +
(hasSimulation ? ", has-simulation" : ", no-simulation");
LOG.info("Test started: " + testMessage);
long blockSize = 8192L;
int stripeLength = 3;
if (isGoodFixer) {
mySetup(stripeLength, -1, "org.apache.hadoop.raid.XORCode",
"org.apache.hadoop.raid.ReedSolomonCode", code, hasSimulation);
} else {
mySetup(stripeLength, -1,
"org.apache.hadoop.raid.BadXORCode",
"org.apache.hadoop.raid.BadReedSolomonCode", code, hasSimulation);
}
Path file1 = new Path("/user/dhruba/raidtest/file1");
Path destPath;
if (code.equals("xor")) {
destPath = new Path("/destraid/user/dhruba/raidtest");
} else {
// code equals "rs"
destPath = new Path("/destraidrs/user/dhruba/raidtest");
}
long crc1 = TestRaidDfs.createTestFilePartialLastBlock(fileSys, file1,
1, 7, blockSize);
LOG.info("Test testBlockFix created test files");
FileStatus statFile1 = fileSys.getFileStatus(file1);
// create an instance of the RaidNode
Configuration localConf = new Configuration(conf);
localConf.set("raid.blockfix.classname",
"org.apache.hadoop.raid.DistBlockIntegrityMonitor");
localConf.setLong("raid.blockfix.filespertask", 2L);
// Add checksum store
TestBlockFixer.setChecksumStoreConfig(localConf);
Codec codec = Codec.getCodec(code);
try {
cnode = RaidNode.createRaidNode(null, localConf);
TestRaidDfs.waitForFileRaided(LOG, fileSys, file1, destPath);
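// Stop the RaidNode once raiding is done; a fresh one is created below
// after the blocks have been corrupted.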
cnode.stop(); cnode.join();
if (!hasChecksum) {
// Clear checksums
LocalChecksumStore lcs = new LocalChecksumStore();
lcs.initialize(localConf, false);
lcs.clear();
}
ParityFilePair pfPair = ParityFilePair.getParityFile(codec, statFile1,
localConf);
assertNotNull(pfPair);
Path parity = pfPair.getPath();
FileStatus stat;
DistributedFileSystem dfs = (DistributedFileSystem)fileSys;
String[] corruptFiles = DFSUtil.getCorruptFiles(dfs);
assertEquals("no corrupt files expected", 0, corruptFiles.length);
assertEquals("filesFixed() should return 0 before fixing files",
0, cnode.blockIntegrityMonitor.getNumFilesFixed());
Path corruptFile;
int[] corruptBlockIdxs;
if (fixSource) {
stat = fileSys.getFileStatus(file1);
LocatedBlocks locs = RaidDFSUtil.getBlockLocations(
dfs, file1.toUri().getPath(), 0, stat.getLen());
// Corrupt blocks in two different stripes. We can fix them.
corruptBlockIdxs = new int[]{1, 4, 6};
for (int idx : corruptBlockIdxs) {
TestBlockFixer.corruptBlock(locs.get(idx).getBlock(), dfsCluster);
}
RaidDFSUtil.reportCorruptBlocks(dfs, file1, corruptBlockIdxs, blockSize);
corruptFile = file1;
} else {
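// Fixing parity: re-baseline the CRC to the parity file so validateFile
// below checks the reconstructed parity, then corrupt its first 3 blocks.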
crc1 = RaidDFSUtil.getCRC(fileSys, parity);
stat = fileSys.getFileStatus(parity);
LocatedBlocks locs = RaidDFSUtil.getBlockLocations(
dfs, parity.toUri().getPath(), 0, stat.getLen());
corruptBlockIdxs = new int[] {0, 1, 2};
for (int idx : corruptBlockIdxs) {
TestBlockFixer.corruptBlock(locs.get(idx).getBlock(), dfsCluster);
}
RaidDFSUtil.reportCorruptBlocks(dfs, parity, corruptBlockIdxs, blockSize);
corruptFile = parity;
}
corruptFiles = DFSUtil.getCorruptFiles(dfs);
assertEquals("file not corrupted", 1, corruptFiles.length);
assertEquals("wrong file corrupted",
corruptFiles[0], corruptFile.toUri().getPath());
assertEquals("wrong number of corrupt blocks", 3,
RaidDFSUtil.corruptBlocksInFile(dfs, corruptFile.toUri().getPath(), 0,
stat.getLen()).size());
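// Restart the RaidNode and wait (up to 2 minutes) for a fix or a failure.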
cnode = RaidNode.createRaidNode(null, localConf);
long start = System.currentTimeMillis();
while (cnode.blockIntegrityMonitor.getNumFilesFixed() < 1 &&
cnode.blockIntegrityMonitor.getNumFileFixFailures() < 1 &&
System.currentTimeMillis() - start < 120000) {
LOG.info("Test TestSimulationBlockFixer waiting for files to be fixed.");
Thread.sleep(1000);
}
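// With simulation enabled (or a good fixer), reconstruction should succeed;
// a bad fixer is expected to fail the simulated decode for every block.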
if (hasSimulation || isGoodFixer) {
assertEquals("file not fixed", 1,
cnode.blockIntegrityMonitor.getNumFilesFixed());
TestBlockFixer.verifyMetrics(fileSys, cnode,
LOGTYPES.OFFLINE_RECONSTRUCTION_BLOCK,
LOGRESULTS.SUCCESS, 3L, true);
boolean fixed = TestRaidDfs.validateFile(dfs, corruptFile, stat.getLen(), crc1);
assertTrue("file not fixed: " + corruptFile.toString(), fixed);
// Verify the counters are right
long expectedNumFailures = isGoodFixer ? 0 : corruptBlockIdxs.length;
assertEquals(expectedNumFailures,
cnode.blockIntegrityMonitor.getNumBlockFixSimulationFailures());
assertEquals(3 - expectedNumFailures,
cnode.blockIntegrityMonitor.getNumBlockFixSimulationSuccess());
if (!hasChecksum) {
TestBlockFixer.verifyMetrics(fileSys, cnode,
LOGTYPES.OFFLINE_RECONSTRUCTION_GET_CHECKSUM,
LOGRESULTS.FAILURE, expectedNumFailures, true);
}
TestBlockFixer.verifyMetrics(fileSys, cnode,
LOGTYPES.OFFLINE_RECONSTRUCTION_CHECKSUM_VERIFICATION,
LOGRESULTS.FAILURE, 0L, false);
TestBlockFixer.verifyMetrics(fileSys, cnode,
LOGTYPES.OFFLINE_RECONSTRUCTION_SIMULATION,
LOGRESULTS.FAILURE, expectedNumFailures, true);
long expectedNumFailedJobs = isGoodFixer ? 0 : 1;
assertEquals("Number of simulated failed jobs should be " +
expectedNumFailedJobs + " for file: " + corruptFile,
expectedNumFailedJobs,
((DistBlockIntegrityMonitor.Worker)cnode.
blockIntegrityMonitor.getCorruptionMonitor()).simFailJobIndex.
size());
} else {
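// Bad fixer with simulation disabled: no simulation counters move, and the
// file fix fails (caught by checksum verification when checksums exist).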
assertEquals(0L, cnode.blockIntegrityMonitor.
getNumBlockFixSimulationFailures());
assertEquals(0L, cnode.blockIntegrityMonitor.
getNumBlockFixSimulationSuccess());
TestBlockFixer.verifyMetrics(fileSys, cnode,
LOGTYPES.OFFLINE_RECONSTRUCTION_SIMULATION,
LOGRESULTS.FAILURE, 0L, false);
assertEquals(1L, cnode.blockIntegrityMonitor.getNumFileFixFailures());
if (hasChecksum) {
// One mismatch checksum will cancel the whole file reconstruction
TestBlockFixer.verifyMetrics(fileSys, cnode,
LOGTYPES.OFFLINE_RECONSTRUCTION_CHECKSUM_VERIFICATION,
LOGRESULTS.FAILURE, 1L, true);
} else {
// all checksums are lost
TestBlockFixer.verifyMetrics(fileSys, cnode,
LOGTYPES.OFFLINE_RECONSTRUCTION_GET_CHECKSUM,
LOGRESULTS.FAILURE, 3L, true);
}
}
LOG.info(
((DistBlockIntegrityMonitor.Worker)cnode.
blockIntegrityMonitor.getCorruptionMonitor()).getStatus().toHtml(500));
} catch (Exception e) {
LOG.error("Test TestSimulationBlockFixer exception: " + e, e);
throw e;
} finally {
myTearDown();
}
LOG.info("Test completed: " + testMessage);
}
}