/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs;
import java.io.*;
import java.util.Iterator;
import java.util.Random;
import java.net.*;
import junit.framework.TestCase;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.ClientProtocol;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.FSConstants;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
import org.apache.hadoop.hdfs.protocol.FSConstants.DatanodeReportType;
import org.apache.hadoop.hdfs.server.datanode.BlockInlineChecksumWriter;
import org.apache.hadoop.hdfs.server.datanode.DataNode;
import org.apache.hadoop.hdfs.server.datanode.SimulatedFSDataset;
import org.apache.hadoop.hdfs.server.namenode.BlockPlacementPolicy;
import org.apache.hadoop.hdfs.server.namenode.BlockPlacementPolicyConfigurable;
import org.apache.hadoop.hdfs.server.namenode.BlockPlacementPolicyDefault;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.BlockLocation;
import static org.junit.Assert.*;
import org.junit.Test;
/**
* This class tests the replication of a DFS file.
*/
public class TestReplication {
private static final long seed = 0xDEADBEEFL;
private static final int blockSize = 8192;
private static final int fileSize = 16384;
private static final String racks[] = new String[] {
"/d1/r1", "/d1/r1", "/d1/r2", "/d1/r2", "/d1/r2", "/d2/r3", "/d2/r3",
"/d2/r3"
};
private static final int numDatanodes = racks.length;
private static final Log LOG = LogFactory.getLog(
"org.apache.hadoop.hdfs.TestReplication");
private static final String[] racks4 = new String[] { racks[0], racks[1], racks[2], racks[3] };
/**
 * Creates {@code name} (overwriting any existing file) with the requested
 * replication factor and a block size of {@code blockSize}, then fills it
 * with {@code fileSize} pseudo-random bytes. With the class constants
 * (16384 bytes, 8192-byte blocks) the file spans two blocks.
 *
 * @param fileSys file system on which the file is created
 * @param name path of the file to write
 * @param repl replication factor requested for the file
 * @throws IOException if the file cannot be created, written or closed
 */
private void writeFile(FileSystem fileSys, Path name, int repl)
    throws IOException {
  FSDataOutputStream stm = fileSys.create(name, true,
      fileSys.getConf().getInt("io.file.buffer.size", 4096),
      (short) repl, (long) blockSize);
  try {
    byte[] buffer = new byte[fileSize];
    // A seeded generator keeps the file contents identical from run to run.
    Random rand = new Random(seed);
    rand.nextBytes(buffer);
    stm.write(buffer);
  } finally {
    // Close even when the write fails so the stream is never leaked.
    stm.close();
  }
}
/*
 * Verifies replica placement for the file "name" once it is replicated.
 *
 * Steps, in order:
 *  1. Waits until every block has at least min(numDatanodes, repl) replicas.
 *     maxWaitSec is passed as -1; waitForBlockReplication only applies its
 *     timeout when maxWaitSec > 0, so this wait is unbounded.
 *  2. Asserts that the FileSystem block locations and the namenode's located
 *     blocks agree on the block count, that each block has one topology path
 *     per replica, and that every topology path starts with one of the racks
 *     reported for that block.
 *  3. Derives two flags from the replica network locations and asserts them:
 *       isOnSameRack    - some replica shares its rack with a later replica;
 *       isNotOnSameRack - some replica (other than the last) shares its rack
 *                         with no later replica.
 *     isOnSameRack must hold. isNotOnSameRack must hold as well, except when
 *     the configured placement policy is BlockPlacementPolicyConfigurable and
 *     repl == 2, in which case it must be false.
 *
 * @param fileSys file system holding the file
 * @param name path of the file to check
 * @param repl replication factor the file was written with
 * @throws IOException if a namenode or file system call fails
 */
private void checkFile(FileSystem fileSys, Path name, int repl)
throws IOException {
Configuration conf = fileSys.getConf();
ClientProtocol namenode = DFSClient.createNamenode(conf);
// Cap the expectation at the cluster size: repl may exceed numDatanodes.
waitForBlockReplication(name.toString(), namenode,
Math.min(numDatanodes, repl), -1);
LocatedBlocks locations = namenode.getBlockLocations(name.toString(),0,
Long.MAX_VALUE);
FileStatus stat = fileSys.getFileStatus(name);
BlockLocation[] blockLocations = fileSys.getFileBlockLocations(stat,0L,
Long.MAX_VALUE);
// verify that rack locations match
assertTrue(blockLocations.length == locations.locatedBlockCount());
for (int i = 0; i < blockLocations.length; i++) {
LocatedBlock blk = locations.get(i);
DatanodeInfo[] datanodes = blk.getLocations();
String[] topologyPaths = blockLocations[i].getTopologyPaths();
// NOTE(review): this local shadows the static "racks" field of the class;
// inside this loop "racks" means the racks reported for block i.
String[] racks = blockLocations[i].getRacks();
assertTrue(topologyPaths.length == datanodes.length);
for (int j = 0; j < topologyPaths.length; j++) {
boolean found = false;
String matchedRack = null;
// Find the first reported rack that is a prefix of topology path j.
for (int k = 0; k < racks.length; k++) {
if (topologyPaths[j].startsWith(racks[k])) {
found = true;
matchedRack = racks[k];
break;
}
}
assertTrue(found);
// NOTE(review): indexes racks with j, so this assumes getRacks() returns
// one entry per topology path, in the same order - confirm.
assertEquals("Rack info should be equal", matchedRack, racks[j]);
}
}
// Both flags start true so that a file whose first block has at most one
// replica passes the assertions below without any rack comparison.
boolean isOnSameRack = true, isNotOnSameRack = true;
for (LocatedBlock blk : locations.getLocatedBlocks()) {
DatanodeInfo[] datanodes = blk.getLocations();
// Zero or one replica: nothing to compare; stop scanning blocks.
if (datanodes.length <= 1) break;
// Exactly two replicas: only record whether they are on different racks
// (isOnSameRack keeps its current value), then stop scanning blocks.
if (datanodes.length == 2) {
isNotOnSameRack = !(datanodes[0].getNetworkLocation().equals(
datanodes[1].getNetworkLocation()));
break;
}
// Three or more replicas: recompute both flags for this block.
isOnSameRack = false;
isNotOnSameRack = false;
for (int i = 0; i < datanodes.length-1; i++) {
LOG.info("datanode "+ i + ": "+ datanodes[i].getName());
// onRack: replica i shares its network location with a later replica.
boolean onRack = false;
for( int j=i+1; j<datanodes.length; j++) {
if( datanodes[i].getNetworkLocation().equals(
datanodes[j].getNetworkLocation()) ) {
onRack = true;
}
}
if (onRack) {
isOnSameRack = true;
}
if (!onRack) {
isNotOnSameRack = true;
}
// Both properties established for this block; no need to look further.
if (isOnSameRack && isNotOnSameRack) break;
}
// A block that misses either property decides the outcome; stop here so
// the assertions below see that block's flags.
if (!isOnSameRack || !isNotOnSameRack) break;
}
assertTrue(isOnSameRack);
if (conf.getClass("dfs.block.replicator.classname", null,
BlockPlacementPolicy.class).equals(
BlockPlacementPolicyConfigurable.class)
&& repl == 2) {
// For BlockPlacementPolicyConfigurable we do in rack replication for r =
// 2.
assertFalse(isNotOnSameRack);
} else {
assertTrue(isNotOnSameRack);
}
}
/**
 * Removes {@code name} recursively, asserting that it exists before the
 * delete and no longer exists afterwards.
 *
 * @param fileSys file system holding the file
 * @param name path to delete
 * @throws IOException if an existence check or the delete fails
 */
private void cleanupFile(FileSystem fileSys, Path name) throws IOException {
  final boolean presentBefore = fileSys.exists(name);
  assertTrue(presentBefore);
  fileSys.delete(name, true);
  final boolean presentAfter = fileSys.exists(name);
  assertFalse(presentAfter);
}
/**
 * Tests that a bad block is reported to the namenode during a replication
 * request.
 *
 * A file is created with replication 1 on a two-datanode cluster, its block
 * is corrupted on disk, and the replication factor is raised to 2. The test
 * then polls the namenode until the block is marked corrupt and asserts that
 * it still has exactly one location.
 *
 * @throws Exception if the cluster fails or the waiting thread is interrupted
 */
@Test
public void testBadBlockReportOnTransfer() throws Exception {
  Configuration conf = new Configuration();
  MiniDFSCluster cluster = new MiniDFSCluster(conf, 2, true, null);
  DFSClient dfsClient = null;
  try {
    cluster.waitActive();
    FileSystem fs = cluster.getFileSystem();
    dfsClient = new DFSClient(new InetSocketAddress("localhost",
        cluster.getNameNodePort()), conf);

    // Create file with replication factor of 1
    Path file1 = new Path("/tmp/testBadBlockReportOnTransfer/file1");
    DFSTestUtil.createFile(fs, file1, 1024, (short) 1, 0);
    DFSTestUtil.waitReplication(fs, file1, (short) 1);

    // Corrupt the block belonging to the created file
    Block block = DFSTestUtil.getFirstBlock(fs, file1);
    cluster.corruptBlockOnDataNodes(block);

    // Increase replication factor, this should invoke transfer request
    // Receiving datanode fails on checksum and reports it to namenode
    fs.setReplication(file1, (short) 2);

    // Now get block details and check if the block is corrupt
    LocatedBlocks blocks = dfsClient.namenode.
        getBlockLocations(file1.toString(), 0, Long.MAX_VALUE);
    while (!blocks.get(0).isCorrupt()) {
      LOG.info("Waiting until block is marked as corrupt...");
      // An interrupt propagates (the method throws Exception) instead of
      // being swallowed, so an aborted run does not keep polling.
      Thread.sleep(1000);
      blocks = dfsClient.namenode.
          getBlockLocations(file1.toString(), 0, Long.MAX_VALUE);
    }
    int replicaCount = blocks.get(0).getLocations().length;
    assertEquals(1, replicaCount);
  } finally {
    // Release the client and the cluster even when an assertion fails.
    try {
      if (dfsClient != null) {
        dfsClient.close();
      }
    } finally {
      cluster.shutdown();
    }
  }
}
/**
 * Runs the write / check / clean-up cycle against a mini cluster that has
 * one datanode per entry of {@code racks}, once for each of the replication
 * factors 3, 10, 4, 1 and 2.
 *
 * @param simulated whether to enable the simulated datanode storage
 * @param clazz block placement policy class to configure on the cluster
 * @throws IOException if any cluster or file system operation fails
 */
private void runReplication(boolean simulated,
    Class<? extends BlockPlacementPolicy> clazz) throws IOException {
  final Configuration conf = new Configuration();
  conf.setClass("dfs.block.replicator.classname", clazz,
      BlockPlacementPolicy.class);
  conf.setBoolean("dfs.replication.considerLoad", false);
  if (simulated) {
    conf.setBoolean(SimulatedFSDataset.CONFIG_PROPERTY_SIMULATED, true);
  }

  MiniDFSCluster cluster = null;
  FileSystem fileSys = null;
  DFSClient client = null;
  try {
    cluster = new MiniDFSCluster(conf, numDatanodes, racks, null,
        true, true);
    cluster.waitActive();
    cluster.getNameNode().namesystem.refreshNodes(conf);

    // Every datanode must be reported live before any file is written.
    client = new DFSClient(cluster.getNameNode().getNameNodeAddress(), conf);
    final DatanodeInfo[] liveNodes =
        client.datanodeReport(DatanodeReportType.LIVE);
    assertEquals("Number of Datanodes ", numDatanodes, liveNodes.length);

    fileSys = cluster.getFileSystem();
    final Path file1 = new Path("/smallblocktest.dat");
    // Same order as the original hand-unrolled sequence.
    final int[] replicationFactors = {3, 10, 4, 1, 2};
    for (int repl : replicationFactors) {
      writeFile(fileSys, file1, repl);
      checkFile(fileSys, file1, repl);
      cleanupFile(fileSys, file1);
    }
  } finally {
    if (client != null) {
      client.close();
    }
    if (fileSys != null) {
      fileSys.close();
    }
    if (cluster != null) {
      cluster.shutdown();
    }
  }
}
/**
 * Replication scenario on simulated storage with the default block
 * placement policy.
 */
@Test
public void testReplicationSimulatedStoragDefault() throws IOException {
  final boolean simulatedStorage = true;
  runReplication(simulatedStorage, BlockPlacementPolicyDefault.class);
}
/**
 * Replication scenario on regular (non-simulated) storage with the default
 * block placement policy.
 */
@Test
public void testReplicationDefault() throws IOException {
  final boolean simulatedStorage = false;
  runReplication(simulatedStorage, BlockPlacementPolicyDefault.class);
}
/**
 * Replication scenario on simulated storage with the configurable block
 * placement policy.
 */
@Test
public void testReplicationSimulatedStoragConfigurable() throws IOException {
  final boolean simulatedStorage = true;
  runReplication(simulatedStorage, BlockPlacementPolicyConfigurable.class);
}
/**
 * Replication scenario on regular (non-simulated) storage with the
 * configurable block placement policy.
 */
@Test
public void testReplicationConfigurable() throws IOException {
  final boolean simulatedStorage = false;
  runReplication(simulatedStorage, BlockPlacementPolicyConfigurable.class);
}
/**
 * Waits for every block of {@code filename}, the last one included, to have
 * at least {@code expected} replicas. Delegates to the five-argument
 * overload with {@code isUnderConstruction} set to false.
 */
private void waitForBlockReplication(String filename,
    ClientProtocol namenode,
    int expected, long maxWaitSec)
    throws IOException {
  final boolean underConstruction = false;
  waitForBlockReplication(filename, namenode, expected, maxWaitSec,
      underConstruction);
}
/**
 * Polls the namenode every 500 ms until every checked block of
 * {@code filename} has at least {@code expected} locations.
 *
 * @param filename path of the file whose blocks are checked
 * @param namenode namenode proxy used to fetch the block locations
 * @param expected minimum number of locations required for each block
 * @param maxWaitSec timeout in seconds; a value &lt;= 0 disables the timeout
 * @param isUnderConstruction when true the last block of the file is not
 *        checked
 * @throws IOException if the namenode call fails or the timeout elapses;
 *         an {@link InterruptedIOException} if the waiting thread is
 *         interrupted
 */
private void waitForBlockReplication(String filename,
    ClientProtocol namenode,
    int expected, long maxWaitSec,
    boolean isUnderConstruction)
    throws IOException {
  long start = System.currentTimeMillis();

  //wait for all the blocks to be replicated;
  LOG.info("Checking for block replication for " + filename);
  while (true) {
    boolean replOk = true;
    LocatedBlocks blocks = namenode.getBlockLocations(filename, 0,
        Long.MAX_VALUE);

    for (Iterator<LocatedBlock> iter = blocks.getLocatedBlocks().iterator();
         iter.hasNext();) {
      LocatedBlock block = iter.next();
      if (isUnderConstruction && !iter.hasNext()) {
        break; // do not check the last block
      }
      int actual = block.getLocations().length;
      if (actual < expected) {
        LOG.info("Not enough replicas for " + block.getBlock() +
            " yet. Expecting " + expected + ", got " +
            actual + ".");
        replOk = false;
        break;
      }
    }

    if (replOk) {
      return;
    }

    if (maxWaitSec > 0 &&
        (System.currentTimeMillis() - start) > (maxWaitSec * 1000)) {
      throw new IOException("Timedout while waiting for all blocks to " +
          " be replicated for " + filename);
    }

    try {
      Thread.sleep(500);
    } catch (InterruptedException ie) {
      // Swallowing the interrupt would keep this loop polling after the
      // test has been asked to stop. Restore the flag and abort instead.
      Thread.currentThread().interrupt();
      InterruptedIOException iioe = new InterruptedIOException(
          "Interrupted while waiting for block replication of " + filename);
      iioe.initCause(ie);
      throw iioe;
    }
  }
}
/**
 * Pending-replication retry scenario with the default block placement
 * policy: the namenode must keep trying the available replicas of an
 * under-replicated block.
 *
 * The scenario creates a one-block file with replication 4, deletes one
 * replica and corrupts two others, and expects the missing replica to be
 * copied from the one valid source.
 */
@Test
public void testPendingReplicationRetryDefault() throws IOException {
  final Class<? extends BlockPlacementPolicy> policy =
      BlockPlacementPolicyDefault.class;
  runPendingReplicationRetry(policy);
}
/**
 * Pending-replication retry scenario with the configurable block placement
 * policy.
 */
@Test
public void testPendingReplicationRetryConfigurable() throws IOException {
  final Class<? extends BlockPlacementPolicy> policy =
      BlockPlacementPolicyConfigurable.class;
  runPendingReplicationRetry(policy);
}
/**
 * Runs the pending-replication retry scenario without inline checksums.
 *
 * The scenario checks that the namenode retries all the available replicas
 * of an under-replicated block: a one-block file is created with
 * replication 4, one replica is removed and two are corrupted, and the
 * missing replica is expected to be copied from one valid source.
 *
 * @param clazz block placement policy class to configure on the cluster
 * @throws IOException if any cluster or file operation fails
 */
private void runPendingReplicationRetry(
    Class<? extends BlockPlacementPolicy> clazz) throws IOException {
  final boolean inlineChecksum = false;
  pendingReplicationRetryInternal(inlineChecksum, clazz);
}
/**
 * Pending-replication retry scenario with inline checksums enabled and the
 * default block placement policy.
 */
@Test
public void testPendingReplicationRetryInlineChecksum() throws IOException {
  final boolean inlineChecksum = true;
  pendingReplicationRetryInternal(inlineChecksum,
      BlockPlacementPolicyDefault.class);
}
/**
 * Checks that the namenode keeps retrying replication of an
 * under-replicated block until a valid source replica is used.
 *
 * Steps:
 *  1. Start a four-datanode cluster, write a 1024-byte file with
 *     replication 4 and wait until it has four replicas.
 *  2. Compute the block file path under storage directories data1..data6,
 *     then shut the cluster down.
 *  3. Delete the first block file found and overwrite 25 bytes of each of
 *     the others at one third of its length; exactly three files must have
 *     been touched.
 *  4. Restart with eight datanodes and short pending-replication and write
 *     timeouts, then wait until the file has four replicas again.
 *
 * @param inlineChecksum whether datanodes store checksums inline with the
 *        block data; applied to both the first and the restarted cluster
 * @param clazz block placement policy class to configure on the cluster
 * @throws IOException if any cluster or file operation fails
 */
private void pendingReplicationRetryInternal(boolean inlineChecksum,
    Class<? extends BlockPlacementPolicy> clazz)
    throws IOException {
  MiniDFSCluster cluster = null;
  DFSClient dfsClient = null;
  int numDataNodes = 4;
  String testFile = "/replication-test-file";
  Path testPath = new Path(testFile);

  byte buffer[] = new byte[1024];
  for (int i = 0; i < buffer.length; i++) {
    buffer[i] = '1';
  }

  try {
    Configuration conf = new Configuration();
    conf.setClass("dfs.block.replicator.classname", clazz,
        BlockPlacementPolicy.class);
    conf.set("dfs.replication", Integer.toString(numDataNodes));
    cluster = new MiniDFSCluster(conf, numDataNodes, racks4, null, true, true);
    cluster.waitActive();
    for (DataNode dn : cluster.getDataNodes()) {
      dn.useInlineChecksum = inlineChecksum;
    }

    dfsClient = new DFSClient(new InetSocketAddress("localhost",
        cluster.getNameNodePort()),
        conf);

    OutputStream out = cluster.getFileSystem().create(testPath);
    try {
      out.write(buffer);
    } finally {
      out.close();
    }

    waitForBlockReplication(testFile, dfsClient.namenode, numDataNodes, -1);

    // get first block of the file.
    Block block = dfsClient.namenode.
        getBlockLocations(testFile, 0, Long.MAX_VALUE).
        get(0).getBlock();

    // Only the first six storage directories are inspected.
    File[] blockFiles = new File[6];
    for (int i = 0; i < blockFiles.length; i++) {
      String fileName;
      if (!inlineChecksum) {
        fileName = block.getBlockName();
      } else {
        fileName = BlockInlineChecksumWriter.getInlineChecksumFileName(block,
            FSConstants.CHECKSUM_TYPE, cluster.conf.getInt(
                "io.bytes.per.checksum",
                FSConstants.DEFAULT_BYTES_PER_CHECKSUM));
      }
      blockFiles[i] = new File(cluster.getBlockDirectory("data" + (i + 1)), fileName);
    }

    // Release the client of the first cluster before that cluster goes away.
    dfsClient.close();
    dfsClient = null;
    cluster.shutdown();
    cluster = null;

    //Now mess up some of the replicas.
    //Delete the first and corrupt the next two.
    for (int i = 0; i < 25; i++) {
      buffer[i] = '0';
    }

    int fileCount = 0;
    for (int i = 0; i < blockFiles.length; i++) {
      File blockFile = blockFiles[i];
      LOG.info("Checking for file " + blockFile);

      if (blockFile.exists()) {
        if (fileCount == 0) {
          LOG.info("Deleting file " + blockFile);
          assertTrue(blockFile.delete());
        } else {
          // corrupt it.
          LOG.info("Corrupting file " + blockFile);
          long len = blockFile.length();
          assertTrue(len > 50);
          RandomAccessFile blockOut = new RandomAccessFile(blockFile, "rw");
          try {
            blockOut.seek(len / 3);
            blockOut.write(buffer, 0, 25);
          } finally {
            blockOut.close();
          }
        }
        fileCount++;
      }
    }
    assertEquals(3, fileCount);

    /* Start the MiniDFSCluster with more datanodes since once a writeBlock
     * to a datanode node fails, same block can not be written to it
     * immediately. In our case some replication attempts will fail.
     */
    LOG.info("Restarting minicluster after deleting a replica and corrupting 2 crcs");
    conf = new Configuration();
    conf.setClass("dfs.block.replicator.classname", clazz,
        BlockPlacementPolicy.class);
    conf.set("dfs.replication", Integer.toString(numDataNodes));
    conf.set("dfs.replication.pending.timeout.sec", Integer.toString(2));
    conf.set("dfs.datanode.block.write.timeout.sec", Integer.toString(5));
    conf.set("dfs.safemode.threshold.pct", "0.75f"); // only 3 copies exist
    // Must match the layout the replicas were written with above; the flag
    // was previously negated, so the restart used the opposite layout.
    conf.setBoolean("dfs.use.inline.checksum", inlineChecksum);

    cluster = new MiniDFSCluster(conf, numDataNodes * 2, racks, null, true,
        true, false);
    cluster.waitActive();

    dfsClient = new DFSClient(new InetSocketAddress("localhost",
        cluster.getNameNodePort()),
        conf);

    waitForBlockReplication(testFile, dfsClient.namenode, numDataNodes, -1);

  } finally {
    // Always release the client, and shut the cluster down even if the
    // client fails to close.
    try {
      if (dfsClient != null) {
        dfsClient.close();
      }
    } finally {
      if (cluster != null) {
        cluster.shutdown();
      }
    }
  }
}
/**
 * Starts a two-datanode cluster with the requested checksum layout and runs
 * the length-mismatch scenario twice: first with a replica shortened by one
 * byte, then with a replica extended by one byte.
 *
 * @param inlineChecksum value for "dfs.use.inline.checksum"
 * @throws Exception if the cluster or either scenario fails
 */
private void testReplicateLenMismatchedBlockInternal(boolean inlineChecksum)
    throws Exception {
  final Configuration conf = new Configuration();
  conf.setBoolean("dfs.use.inline.checksum", inlineChecksum);
  final MiniDFSCluster cluster = new MiniDFSCluster(conf, 2, true, null);
  try {
    cluster.waitActive();
    // -1: truncated block, then +1: extended block
    final int[] lengthDeltas = {-1, 1};
    for (int delta : lengthDeltas) {
      changeBlockLen(cluster, delta, inlineChecksum);
    }
  } finally {
    cluster.shutdown();
  }
}
/**
 * Checks that replication detects on-disk blocks whose length does not
 * match, with inline checksums disabled.
 *
 * @throws Exception if the scenario fails
 */
@Test
public void testReplicateLenMismatchedBlock() throws Exception {
  final boolean inlineChecksum = false;
  testReplicateLenMismatchedBlockInternal(inlineChecksum);
}
/**
 * Checks that replication detects on-disk blocks whose length does not
 * match, with inline checksums enabled.
 *
 * @throws Exception if the scenario fails
 */
@Test
public void testReplicateLenMismatchedBlockInlineChecksum() throws Exception {
  final boolean inlineChecksum = true;
  testReplicateLenMismatchedBlockInternal(inlineChecksum);
}
/**
 * Alters the on-disk length of a replica and checks how replication reacts.
 *
 * Creates /file1 with replication 1 and a length of "io.bytes.per.checksum"
 * bytes (512 if unset), changes the replica length by {@code lenDelta} on
 * the first datanode where TestDatanodeBlockScanner.changeReplicaLength
 * returns true, raises the replication factor to 2 and polls the namenode:
 *  - lenDelta &lt; 0: waits until the block is marked corrupt and has exactly
 *    one location;
 *  - otherwise: waits until the block is either marked corrupt or has two
 *    locations, then logs the outcome.
 * The file is deleted at the end.
 *
 * @param cluster running mini cluster to operate on
 * @param lenDelta number of bytes by which the replica length is changed
 * @param isInlineChecksum whether the block file uses the inline-checksum
 *        file name
 * @throws IOException if a file system or namenode call fails
 * @throws InterruptedException if interrupted while polling
 */
private void changeBlockLen(MiniDFSCluster cluster,
    int lenDelta, boolean isInlineChecksum) throws IOException,
    InterruptedException {
  final Path fileName = new Path("/file1");
  final short REPLICATION_FACTOR = (short) 1;
  final FileSystem fs = cluster.getFileSystem();
  final int fileLen = fs.getConf().getInt("io.bytes.per.checksum", 512);
  DFSTestUtil.createFile(fs, fileName, fileLen, REPLICATION_FACTOR, 0);
  DFSTestUtil.waitReplication(fs, fileName, REPLICATION_FACTOR);

  String block;
  if (!isInlineChecksum) {
    block = DFSTestUtil.getFirstBlock(fs, fileName).getBlockName();
  } else {
    block = BlockInlineChecksumWriter.getInlineChecksumFileName(DFSTestUtil
        .getFirstBlock(fs, fileName), FSConstants.CHECKSUM_TYPE, cluster.conf
        .getInt("io.bytes.per.checksum",
            FSConstants.DEFAULT_BYTES_PER_CHECKSUM));
  }

  // Change the length of a replica
  for (int i = 0; i < cluster.getDataNodes().size(); i++) {
    if (TestDatanodeBlockScanner.changeReplicaLength(block, i, lenDelta, cluster)) {
      break;
    }
  }

  // increase the file's replication factor
  fs.setReplication(fileName, (short) (REPLICATION_FACTOR + 1));

  // block replication triggers corrupt block detection
  DFSClient dfsClient = new DFSClient(new InetSocketAddress("localhost",
      cluster.getNameNodePort()), fs.getConf());
  try {
    LocatedBlocks blocks = dfsClient.namenode.getBlockLocations(
        fileName.toString(), 0, fileLen);
    if (lenDelta < 0) { // replica truncated
      while (!blocks.get(0).isCorrupt() ||
          REPLICATION_FACTOR != blocks.get(0).getLocations().length) {
        Thread.sleep(100);
        blocks = dfsClient.namenode.getBlockLocations(
            fileName.toString(), 0, fileLen);
      }
    } else { // no corruption detected; block replicated
      while (!blocks.get(0).isCorrupt() &&
          REPLICATION_FACTOR + 1 != blocks.get(0).getLocations().length) {
        Thread.sleep(100);
        blocks = dfsClient.namenode.getBlockLocations(
            fileName.toString(), 0, fileLen);
      }
      LOG.info("Block is " +
          (blocks.get(0).isCorrupt() ? "corrupted" : "healthy"));
      LOG.info("Replication number: " + blocks.get(0).getLocations().length);
    }
  } finally {
    // This helper runs once per scenario; close the client each time so
    // clients do not accumulate.
    dfsClient.close();
  }
  fs.delete(fileName, true);
}
/**
 * Checks that every block except the last one of a file that is still
 * under construction gets replicated.
 *
 * On a three-datanode cluster with a 1024-byte block size and a default
 * replication of 2, the test writes 2048 bytes to a file and leaves the
 * stream open. It waits until every block but the last has two replicas,
 * raises the file's replication factor to 3, and waits up to 300 seconds
 * for those blocks to reach three replicas.
 *
 * @throws IOException if a cluster or file operation fails or the wait
 *         times out
 */
@Test
public void testBlockReplicationInUCF() throws IOException {
  MiniDFSCluster cluster = null;
  DFSClient dfsClient = null;
  OutputStream out = null;
  short numDataNodes = 3;
  String testFile = "/replication-test-file";
  Path testPath = new Path(testFile);

  byte buffer[] = new byte[1024];
  for (int i = 0; i < buffer.length; i++) {
    buffer[i] = '1';
  }

  try {
    Configuration conf = new Configuration();
    conf.set("dfs.replication", Integer.toString(numDataNodes - 1));
    conf.setLong("dfs.block.size", 1024L);
    cluster = new MiniDFSCluster(0, conf, numDataNodes, true,
        true, null, null);
    cluster.waitActive();
    dfsClient = new DFSClient(new InetSocketAddress("localhost",
        cluster.getNameNodePort()),
        conf);

    FileSystem fs = cluster.getFileSystem();
    out = fs.create(testPath);
    // Deliberately left open: the file must stay under construction while
    // the replication of its blocks is checked.
    out.write(buffer);
    out.write(buffer);

    waitForBlockReplication(testFile, dfsClient.namenode, numDataNodes - 1, -1, true);

    // bump this file's replication factor
    fs.setReplication(testPath, numDataNodes);
    waitForBlockReplication(testFile, dfsClient.namenode, numDataNodes, 300, true);
  } finally {
    try {
      if (out != null) {
        try {
          out.close();
        } catch (IOException e) {
          // Best effort: a close failure must not mask the test outcome.
          LOG.warn("Failed to close " + testFile, e);
        }
      }
      if (dfsClient != null) {
        dfsClient.close();
      }
    } finally {
      if (cluster != null) {
        cluster.shutdown();
      }
    }
  }
}
/**
 * Tests that the data transfer rate cap takes effect.
 *
 * With "dfs.data.transfer.max.bytes.per.sec" set to 128 KB, a 512 KB file is
 * created with replication 1 and then bumped to replication 2; the second
 * replica must take more than 3700 ms to appear.
 *
 * @throws Exception if the cluster or a file system operation fails
 */
@Test
public void testRateCap() throws Exception {
  Configuration conf = new Configuration();
  conf.setLong("dfs.data.transfer.max.bytes.per.sec", 128 * 1024);
  MiniDFSCluster cluster = new MiniDFSCluster(conf, 2, true, null);
  try {
    cluster.waitActive();
    FileSystem fs = cluster.getFileSystem();

    // Create file with replication factor of 1
    Path file1 = new Path("/tmp/testRateCap");
    DFSTestUtil.createFile(fs, file1, 512 * 1024, (short) 1, 0);
    DFSTestUtil.waitReplication(fs, file1, (short) 1);

    // Make sure replication doesn't finish too fast.
    long startTime = System.currentTimeMillis();
    fs.setReplication(file1, (short) 2);
    DFSTestUtil.waitReplication(fs, file1, (short) 2);
    long elapsedMs = System.currentTimeMillis() - startTime;

    LOG.info("Taking " + elapsedMs + " ms to replicate.");
    // 512 KB at 128 KB/s is 4000 ms; the 3700 ms bound presumably leaves
    // slack for timer granularity - confirm before tightening.
    assertTrue("Replication finished in " + elapsedMs
        + " ms, expected more than 3700 ms with the rate cap",
        elapsedMs > 3700);
  } finally {
    cluster.shutdown();
  }
}
}