package org.apache.hadoop.hdfs.server.namenode;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.commons.logging.impl.Log4JLogger;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.MiniDFSNNTopology;
import org.apache.log4j.Level;
import org.junit.Ignore;
import org.junit.Test;
import java.io.IOException;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
/* We have no control over the order in which the datanodes and clients
* communicate with the namenode. Consider a scenario where a client is
* writing to a file: all the blocks have reached the minimum replication
* level, but the datanodes have not yet reported all the replicas. The
* namenode closes the file and marks the blocks that have not yet been fully
* reported as under-replicated, and the replication monitor starts the
* replication process for those blocks.
*
* In this test the number of datanodes equals the replication factor of the
* file, so the replication monitor tries to create new replicas on datanodes
* that already hold a copy (just not yet reported to the namenode). This
* results in a ReplicaAlreadyExistsException.
*
* The default placement policy also logs warnings such as
* "Not able to place enough replicas".
*
* If too many writer threads are created, the data length is sometimes
* inconsistent. In that case look for "java.io.IOException: Too many open
* files" in the log.
*/
@Ignore(value = "The design of this test needs to be reconsidered. " +
"It fails most of the time because of race conditions.")
public class TestHAFileCreation {
public static final Log LOG = LogFactory.getLog(TestHAFileCreation.class);
static {
((Log4JLogger) NameNode.stateChangeLog).getLogger().setLevel(Level.ALL);
((Log4JLogger) LeaseManager.LOG).getLogger().setLevel(Level.ALL);
((Log4JLogger) LogFactory.getLog(FSNamesystem.class)).getLogger()
.setLevel(Level.ALL);
}
Configuration conf = new HdfsConfiguration();
MiniDFSCluster cluster = null;
FileSystem fs = null;
static final int NN1 = 0, NN2 = 1;
static int NUM_NAMENODES = 2;
static int NUM_DATANODES = 3;
// timeout before a dead leader namenode is declared lost; the 10 second
// default is recomputed from the configuration in setupCluster()
long NNDeathTimeout = 10000;
boolean writeInSameDir = true; // all writers write under the same base directory
boolean killNN = true; // kill the leader namenode while the writers are active
boolean waitFileisClosed = true; // writers wait until their file is closed
int fileCloseWaitTime = 5000; // ms a writer waits for its file to be closed
int waitReplicationTimeout = 5 * 60 * 1000; // ms to wait for full replication
Path baseDir = new Path("/testsLoad");
Writer[] writers = new Writer[50];
private void setupCluster(int replicationFactor) throws IOException {
// initialize the cluster with at least 2 namenodes and at least as many
// datanodes as the replication factor
if (NUM_NAMENODES < 2) {
NUM_NAMENODES = 2;
}
if (replicationFactor > NUM_DATANODES) {
NUM_DATANODES = replicationFactor;
}
this.conf = new HdfsConfiguration();
conf.setInt(DFSConfigKeys.DFS_REPLICATION_KEY, replicationFactor);
// use a small block size so that each file spans many blocks
conf.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, 1024);
// let the replication monitor re-schedule pending replications quickly
conf.setLong(DFSConfigKeys.DFS_NAMENODE_REPLICATION_PENDING_TIMEOUT_SEC_KEY,
15);
cluster = new MiniDFSCluster.Builder(conf)
.nnTopology(MiniDFSNNTopology.simpleHOPSTopology(NUM_NAMENODES))
.format(true).numDataNodes(NUM_DATANODES).build();
cluster.waitActive();
LOG.debug(
"NN1 address is " + cluster.getNameNode(NN1).getNameNodeAddress() +
" ld: " + cluster.getNameNode(NN1).isLeader() + " NN2 address is " +
cluster.getNameNode(NN2).getNameNodeAddress() + " ld: " +
cluster.getNameNode(NN2).isLeader());
fs = cluster.getNewFileSystemInstance(NN1);
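// a leader is declared dead after it misses the configured number of
// heartbeats, plus a margin of 2, at the leader-check interval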
NNDeathTimeout =
conf.getInt(DFSConfigKeys.DFS_LEADER_CHECK_INTERVAL_IN_MS_KEY,
DFSConfigKeys.DFS_LEADER_CHECK_INTERVAL_IN_MS_DEFAULT) * (conf
.getInt(DFSConfigKeys.DFS_LEADER_MISSED_HB_THRESHOLD_KEY,
DFSConfigKeys.DFS_LEADER_MISSED_HB_THRESHOLD_DEFAULT) + 2);
// create the directory namespace
assertTrue(fs.mkdirs(baseDir));
// create writers
for (int i = 0; i < writers.length; i++) {
writers[i] = new Writer(fs, "file" + i, writeInSameDir, baseDir,
waitFileisClosed, fileCloseWaitTime);
}
}
private void shutdown() {
if (cluster != null) {
cluster.shutdown();
}
}
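// A minimal sketch of the kind of close-polling the writers are configured
// for via waitFileisClosed/fileCloseWaitTime. This helper is illustrative
// only and is not wired into the test; it assumes the Apache HDFS
// DistributedFileSystem#isFileClosed(Path) API is available in the branch
// under test.
private boolean waitUntilFileIsClosed(Path file, long timeoutMs)
throws IOException, InterruptedException {
org.apache.hadoop.hdfs.DistributedFileSystem dfs =
(org.apache.hadoop.hdfs.DistributedFileSystem) fs;
long deadline = System.currentTimeMillis() + timeoutMs;
while (System.currentTimeMillis() < deadline) {
if (dfs.isFileClosed(file)) {
return true; // namenode has completed the file and released the lease
}
Thread.sleep(100); // poll until the deadline expires
}
return false;
}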
/**
 * Under load, perform a failover by killing the leader NN1. NN2 becomes the
 * leader and the load is then processed by NN2. The load should continue
 * uninterrupted and no corrupt blocks should be reported.
 */
@Test
public void testFailoverWhenLeaderNNCrashesTest1() {
// Testing with replication factor of 3
short repFactor = 3;
LOG.info(
"Running test [testFailoverWhenLeaderNNCrashes()] with replication factor " +
repFactor);
failoverWhenLeaderNNCrashes(repFactor);
}
@Test
public void testFailoverWhenLeaderNNCrashesTest2() {
// Testing with replication factor of 6
short repFactor = 6;
LOG.info(
"Running test [testFailoverWhenLeaderNNCrashes()] with replication factor " +
repFactor);
failoverWhenLeaderNNCrashes(repFactor);
}
private void failoverWhenLeaderNNCrashes(short replicationFactor) {
try {
// setup the cluster with required replication factor
setupCluster(replicationFactor);
// save leader namenode port to restart with the same port
int nnport = cluster.getNameNodePort(NN1);
try {
// writers start writing to their files
Writer.startWriters(writers);
// give all the threads a chance to create their files and write something to them
Thread.sleep(10000);
LOG.debug("TestNN about to shutdown the namenode with address " +
cluster.getNameNode(NN1).getNameNodeAddress());
// kill leader NN1
if (killNN) {
cluster.shutdownNameNode(NN1);
LOG.debug("TestNN KILLED Namenode with address ");
TestHABasicFailover.waitLeaderElection(cluster.getDataNodes(),
cluster.getNameNode(NN2), NNDeathTimeout);
// Check NN2 is the leader and failover is detected
assertTrue("TestNN NN2 is expected to be the leader, but is not",
cluster.getNameNode(NN2).isLeader());
assertTrue("TestNN Not all datanodes detected the new leader",
TestHABasicFailover
.doesDataNodesRecognizeLeader(cluster.getDataNodes(),
cluster.getNameNode(NN2)));
}
// the load should still continue without any IO Exception thrown
LOG.info("TestNN Wait a few seconds. Let them write some more");
Thread.sleep(10000);
} finally {
Writer.stopWriters(writers);
}
LOG.debug("TestNN All File Should Have been closed");
Writer.verifyFile(writers, fs);
// periodic block reports will inform the namenode of any remaining
// under-replicated blocks; hflush() and close() guarantee the data reached
// every datanode in the pipeline, and waitReplication() confirms that the
// target replication factor has been met
Writer.waitReplication(fs, writers, replicationFactor,
waitReplicationTimeout);
// The cluster-restart verification below is disabled pending the test
// redesign noted in the @Ignore annotation.
if (true) {
return;
}
// restart the cluster without formatting, using the same port and configuration
cluster.shutdown();
cluster =
new MiniDFSCluster.Builder(conf).nameNodePort(nnport).format(false)
.nnTopology(MiniDFSNNTopology.simpleHOPSTopology(NUM_NAMENODES))
.numDataNodes(NUM_DATANODES).build();
cluster.waitActive();
// refresh the client so that it has an up-to-date list of namenodes; blacklisted namenodes are removed
fs = cluster.getNewFileSystemInstance(NN1);
// verifyFile throws IOException, which is caught by the enclosing catch block
Writer.verifyFile(writers, fs);
} catch (Exception ex) {
LOG.error("Received exception: " + ex.getMessage(), ex);
fail("Exception: " + ex.getMessage());
} finally {
shutdown();
}
}
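// The Javadoc for the failover tests states that no corrupt blocks should
// be reported, but the test never verifies this. A minimal sketch of such a
// check, assuming DistributedFileSystem#listCorruptFileBlocks(Path) is
// available in the branch under test (it is in Apache Hadoop 2.x); the
// helper is illustrative and not yet called from the test body.
private void assertNoCorruptBlocks() throws IOException {
org.apache.hadoop.hdfs.DistributedFileSystem dfs =
(org.apache.hadoop.hdfs.DistributedFileSystem) fs;
org.apache.hadoop.fs.RemoteIterator<Path> corrupt =
dfs.listCorruptFileBlocks(baseDir);
assertFalse("Corrupt blocks were reported under " + baseDir,
corrupt.hasNext());
}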
}