/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.raid;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Random;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.atomic.AtomicLong;
import java.util.zip.CRC32;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.raid.DistBlockIntegrityMonitor.Priority;
import org.apache.hadoop.raid.protocol.PolicyInfo;

/**
 * In this smoke test Callable, the raidnode raids a small test file and then
 * submits a small job to fix a corrupted block of that file, to verify that
 * block fixing is working. If the test fails, the raidnode is shut down.
 */
public class SmokeTestThread implements Callable<Boolean> {
  public static final Log LOG = LogFactory.getLog(SmokeTestThread.class);
  public static final String TEST_CODEC = "rs";
  public static final long NUM_SOURCE_BLOCKS = 3;
  public static final long BLOCK_SIZE = 512L;
  public static final long SLEEP_TIME = 3000L;
  public static final String SMOKE_TEST_TIMEOUT_KEY = "raid.smoke.test.timeout";
  public static final long DEFAULT_SMOKE_TEST_TIME_OUT = 120000L;
  public long timeOut = DEFAULT_SMOKE_TEST_TIME_OUT;
  public IOException ioe = null;
  public FileSystem fileSys = null;
  public String testFileDirectory = "/test/";
  public String testFileBase = testFileDirectory + "smoketest";
  public Random rand = new Random();
  public DistRaidNode distRaidNode = null;
  public CRC32 checksum = new CRC32();

  public SmokeTestThread(RaidNode rn) {
    distRaidNode = (DistRaidNode) rn;
    timeOut = distRaidNode.getConf().getLong(SMOKE_TEST_TIMEOUT_KEY,
        DEFAULT_SMOKE_TEST_TIME_OUT);
  }

  @Override
  public Boolean call() throws Exception {
    Path testPath = null;
    try {
      fileSys = FileSystem.get(distRaidNode.getConf());
      // Create a small file with NUM_SOURCE_BLOCKS blocks
      String testFile = testFileBase + rand.nextLong();
      testPath = new Path(testFile);
      if (fileSys.exists(testPath)) {
        fileSys.delete(testPath, true);
      }
      long blockSize = BLOCK_SIZE;
      FSDataOutputStream stm = fileSys.create(testPath, true,
          fileSys.getConf().getInt("io.file.buffer.size", 4096),
          (short) 3, blockSize);
      // Write 3 blocks.
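      // Each block is filled with random bytes; the CRC32 over everything
      // written is recorded so the file contents can be re-verified after
      // block fixing.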
      byte[] b = new byte[(int) blockSize];
      for (int i = 0; i < NUM_SOURCE_BLOCKS; i++) {
        rand.nextBytes(b);
        stm.write(b);
        checksum.update(b);
      }
      stm.close();
      LOG.info("[SMOKETEST] Created a test file: " + testFile +
          " with CRC32 checksum " + checksum.getValue());

      PolicyInfo info = new PolicyInfo(testFile, distRaidNode.getConf());
      info.setCodecId(TEST_CODEC);
      info.setSrcPath(testFileDirectory);
      info.setShouldRaid(true);
      info.setProperty("modTimePeriod", "0");
      info.setProperty("targetReplication", "1");
      info.setProperty("metaReplication", "1");
      FileStatus stat = fileSys.getFileStatus(testPath);
      ArrayList<FileStatus> fstats = new ArrayList<FileStatus>();
      fstats.add(stat);

      // Raid it using rs
      DistRaid dr = DistRaidNode.raidFiles(distRaidNode.getConf(),
          distRaidNode.jobMonitor, fstats, info);
      LOG.info("[SMOKETEST] RS Raid test file: " + testFile);
      if (dr == null) {
        throw new IOException("Failed to start a raiding job");
      }
      long startTime = System.currentTimeMillis();
      while (!dr.checkComplete() &&
             System.currentTimeMillis() - startTime < timeOut) {
        Thread.sleep(SLEEP_TIME);
      }
      if (!dr.checkComplete()) {
        throw new IOException("Failed to finish the raiding job in " +
            (timeOut / 1000) + " seconds");
      }
      if (!dr.successful()) {
        throw new IOException("Failed to raid the file " + testFile);
      }
      LOG.info("[SMOKETEST] Finished raiding test file: " + testFile);

      // Verify the parity file exists and has the expected size
      Codec codec = Codec.getCodec(TEST_CODEC);
      Path parityPath = new Path(codec.getParityPrefix(),
          RaidNode.makeRelative(testPath));
      FileStatus parityStat = fileSys.getFileStatus(parityPath);
      long numParityBlocks = RaidNode.numBlocks(parityStat);
      long expectedNumParityBlocks =
          RaidNode.numStripes(NUM_SOURCE_BLOCKS, codec.stripeLength) *
          codec.parityLength;
      if (numParityBlocks != expectedNumParityBlocks ||
          parityStat.getLen() != expectedNumParityBlocks * BLOCK_SIZE) {
        throw new IOException("[SMOKETEST] Parity file " + parityPath +
            " has " + numParityBlocks + " blocks and " + parityStat.getLen() +
            " bytes, but we expect " + expectedNumParityBlocks +
            " blocks and " + (expectedNumParityBlocks * BLOCK_SIZE) +
            " bytes");
      }
      LOG.info("[SMOKETEST] Verification of parity file " + parityPath +
          " succeeded");

      // Corrupt the first block by reporting it as bad
      LocatedBlock[] blocks = new LocatedBlock[1];
      LocatedBlocks lbs = ((DistributedFileSystem) fileSys).getLocatedBlocks(
          testPath, 0, Integer.MAX_VALUE);
      blocks[0] = lbs.get(0);
      ((DistributedFileSystem) fileSys).getClient().reportBadBlocks(blocks);
      LOG.info("[SMOKETEST] Finished corrupting the first block " +
          lbs.get(0).getBlock());

      // Submit a job to "fix" it
      Set<String> jobFiles = new HashSet<String>();
      jobFiles.add(testFile);
      Job job = DistBlockIntegrityMonitor.startOneJob(
          (DistBlockIntegrityMonitor.Worker)
              distRaidNode.blockIntegrityMonitor.getCorruptionMonitor(),
          Priority.HIGH, jobFiles, System.currentTimeMillis(),
          new AtomicLong(0), new AtomicLong(System.currentTimeMillis()),
          Integer.MAX_VALUE);
      startTime = System.currentTimeMillis();
      while (!job.isComplete() &&
             System.currentTimeMillis() - startTime < timeOut) {
        Thread.sleep(SLEEP_TIME);
      }
      if (!job.isComplete()) {
        throw new IOException("Failed to finish the blockfixing job in " +
            (timeOut / 1000) + " seconds");
      }
      if (!job.isSuccessful()) {
        throw new IOException("Failed to fix the file " + testFile);
      }
      LOG.info("[SMOKETEST] Finished blockfixing test file: " + testFile);

      // Wait until the block is no longer reported as corrupt
      startTime = System.currentTimeMillis();
      while (((DistributedFileSystem) fileSys)
                 .getLocatedBlocks(testPath, 0, Integer.MAX_VALUE)
                 .get(0).isCorrupt() &&
             System.currentTimeMillis() - startTime < timeOut) {
        Thread.sleep(SLEEP_TIME);
      }

      // Re-read the fixed file and verify its checksum against the original
      CRC32 newChk = new CRC32();
      FSDataInputStream readStm = fileSys.open(testPath);
      int num = 0;
      while (num >= 0) {
        num = readStm.read(b);
        if (num < 0) {
          break;
        }
        newChk.update(b, 0, num);
      }
      readStm.close();
      if (newChk.getValue() != checksum.getValue()) {
        throw new IOException("Fixed file's checksum " + newChk.getValue() +
            " != original one " + checksum.getValue());
      }
      LOG.info("[SMOKETEST] Verification of fixed test file: " + testFile);
      return true;
    } catch (IOException ex) {
      LOG.error("Got IOException in SmokeTestThread", ex);
      ioe = ex;
      return false;
    } catch (Throwable ex) {
      LOG.error("Got Error in SmokeTestThread", ex);
      ioe = new IOException(ex);
      return false;
    } finally {
      try {
        if (fileSys != null) {
          fileSys.delete(testPath, true);
        }
      } catch (IOException e) {
        LOG.error("Got error during deletion", e);
      }
    }
  }
}
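// Usage sketch (hypothetical caller, not part of this class): since
// SmokeTestThread is a Callable<Boolean>, a caller would typically submit it
// to an ExecutorService and act on the returned Boolean, e.g.:
//
//   ExecutorService executor = Executors.newSingleThreadExecutor();
//   Future<Boolean> result = executor.submit(new SmokeTestThread(raidNode));
//   if (!result.get()) {
//     // smoke test failed; shut down the raidnode
//   }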