/*
* Copyright (C) 2015 hops.io.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.hops.leaderElection.experiments;
import io.hops.exception.StorageException;
import io.hops.exception.StorageInitializtionException;
import io.hops.leaderElection.HdfsLeDescriptorFactory;
import io.hops.leaderElection.VarsRegister;
import io.hops.metadata.LEStorageFactory;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics;
import org.apache.hadoop.conf.Configuration;
import org.apache.log4j.Level;
import org.apache.log4j.LogManager;
import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.Option;
import org.kohsuke.args4j.spi.OptionHandler;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Random;
import java.util.logging.Logger;
/**
 * Command-line driver for a leader-election fail-over experiment.
 *
 * <p>The experiment formats the storage, starts {@code max_processes}
 * {@link LightWeightNameNode} instances, waits until the last started node
 * reports as many active nodes as were started, waits until the reported
 * leader-election time period stops changing, and then repeatedly stops the
 * node that reports itself as leader while recording how long it takes until a
 * different node reports itself as leader. Progress and results are appended to
 * {@code output_file_path}.
 *
 * <p>Most unrecoverable conditions are reported to the output file and end the
 * JVM through {@code System.exit(-1)}.
 */
public class Experiment1 {

  private static final Log LOG = LogFactory.getLog(Experiment1.class);

  // Placeholder addresses and storage driver settings handed to the nodes and
  // to the storage factory.
  private static final String HTTP_ADDRESS = "dummy.address.com:9999";
  private static final String RPC_ADDRESS = "repc.server.ip:0000";
  private static final String DRIVER_CLASS =
      "io.hops.metadata.ndb.NdbStorageFactory";
  private static final String DFS_STORAGE_DRIVER_CONFIG_FILE =
      "ndb-config.properties";

  /** How long to wait for a (new) leader before giving up, in ms. */
  private static final long MAX_WAIT_FOR_FAIL_OVER = 5 * 60 * 1000;

  /** Not referenced inside this class; kept for package-level users. */
  Configuration conf = null;

  /** Every node ever started by this experiment, including stopped ones. */
  List<LightWeightNameNode> nnList;

  @Option(name = "-time_period", usage = "Time Period")
  private int time_period = 2 * 1000;

  @Option(name = "-time_period_increment",
      usage = "Time period increment. stabilization factor")
  private long time_period_increment = 200;

  @Option(name = "-missed_hb_threshold", usage = "Missed HB Threshold")
  private int missed_hb_threshold = 2;

  // NOTE(review): the default below is a developer-machine path; callers are
  // expected to override it with -ndb_jar.
  @Option(name = "-ndb_jar", usage = "NDB Implementation Driver JAR Path")
  private String driver_jar =
      "/home/salman/NetbeanProjects/hop/hops-metadata-dal-impl-ndb/target/hops-metadata-dal-impl-ndb-1.0-SNAPSHOT-jar-with-dependencies.jar";

  @Option(name = "-max_processes", usage = "Max number of processes")
  private int max_processes = 20;

  // NOTE(review): the usage text says "> 1" but any value > 0 is treated as a
  // fixed wait by startProcesses().
  @Option(name = "-process_join_wait_time",
      usage = "Process join wait time. 0 for no wait, -1 for random wait between [0, time_period), and > 1 for fixed wait")
  private int process_join_wait_time = -1;

  @Option(name = "-number_of_leaders_to_kill",
      usage = "Number of Leaders to kill")
  private int number_of_leaders_to_kill = 10;

  @Option(name = "-consider_stable_after",
      usage = "If the time_period does not change for this long then the system is considered to be stable")
  private long consider_stable_after = 10 * 1 * 1000;

  @Option(name = "-max_stabilization_wait_time",
      usage = "Maximum wait time to see if the system has stabilized. it should be > consider_stable_after")
  private long max_stabilization_wait_time = 1 * 60 * 1000;

  @Option(name = "-output_file_path", usage = "Output File")
  private String output_file_path = "results.txt";

  /** Collected fail-over times in milliseconds. */
  private final DescriptiveStatistics stats = new DescriptiveStatistics();

  /** Time period observed when the system was declared stable. */
  private long stable_time_period;

  /**
   * Runs one experiment with the given command-line arguments and exits the
   * JVM with status 0 once {@link #runExperiment(String[])} returns.
   *
   * @param argv command-line options, see the {@code @Option} fields
   * @throws Exception anything propagated by {@link #runExperiment(String[])}
   */
  public static void main(String[] argv) throws Exception {
    Experiment1 exp = new Experiment1();
    exp.runExperiment(argv);
    System.exit(0);
  }

  /**
   * Parses the arguments and executes all phases of the experiment.
   *
   * <p>On unparsable or unusable arguments the problem and the usage are
   * printed to {@code System.err} and the method returns without touching the
   * storage. Once nodes may have been started, {@link #tearDown()} is run even
   * when a phase throws, so that an exception does not leave nodes running.
   *
   * @param args command-line options
   * @throws StorageInitializtionException propagated from {@link #init()}
   * @throws StorageException propagated from {@link #init()}
   * @throws IOException if the output file cannot be written
   * @throws ClassNotFoundException propagated from {@link #init()}
   * @throws InterruptedException if a wait is interrupted
   */
  public void runExperiment(String[] args)
      throws StorageInitializtionException, StorageException, IOException,
      ClassNotFoundException, InterruptedException {
    CmdLineParser parser = new CmdLineParser(this);
    parser.setUsageWidth(80);
    try {
      parser.parseArgument(args);
    } catch (CmdLineException e) {
      System.err.println(e.getMessage());
      parser.printUsage(System.err);
      System.err.println();
      return;
    }

    // Later phases index into the list of started nodes, so at least one node
    // is required. Reject the value before init() formats the storage.
    if (max_processes < 1) {
      System.err.println(
          "-max_processes must be at least 1 but was " + max_processes);
      parser.printUsage(System.err);
      System.err.println();
      return;
    }

    init();
    try {
      writeStartMessages(args);
      startProcesses();
      waitAllJoin();
      waitForTheSystemToStabilize();
      killLeaders();
      writeResults();
    } finally {
      tearDown();
    }
  }

  /**
   * Sets the root log level to INFO, creates the empty node list, configures
   * and formats the storage and registers the HDFS default values.
   */
  private void init()
      throws StorageInitializtionException, StorageException, IOException,
      ClassNotFoundException {
    LogManager.getRootLogger().setLevel(Level.INFO);
    nnList = new ArrayList<LightWeightNameNode>();
    LEStorageFactory.setConfiguration(driver_jar, DRIVER_CLASS,
        DFS_STORAGE_DRIVER_CONFIG_FILE);
    LEStorageFactory.formatStorage();
    VarsRegister.registerHdfsDefaultValues();
  }

  /** Stops every node that was started, including already stopped leaders. */
  private void tearDown() {
    LOG.info("TearDown ... ");
    for (LightWeightNameNode nn : nnList) {
      nn.stop();
    }
  }

  /**
   * Starts {@code max_processes} nodes, pausing after each start according to
   * {@code process_join_wait_time}: 0 means no pause, -1 a random pause in
   * [0, time_period) and a positive value a fixed pause in milliseconds.
   *
   * <p>The wait setting is validated before the first node is started; an
   * unusable setting is written to the output file and ends the JVM.
   */
  private void startProcesses() throws InterruptedException, IOException {
    final boolean randomWait = process_join_wait_time == -1;
    if (process_join_wait_time < -1) {
      writeMessageToFile("Unsupported process wait time. Fix process args ");
      System.exit(-1);
    }
    if (randomWait && time_period <= 0) {
      // Random.nextInt requires a positive bound.
      writeMessageToFile(
          "Random process join wait needs a positive time_period. Fix process args ");
      System.exit(-1);
    }

    Random rand = new Random(System.currentTimeMillis());
    for (int i = 0; i < max_processes; i++) {
      startAProcess();
      if (randomWait) {
        Thread.sleep(rand.nextInt(time_period));
      } else if (process_join_wait_time > 0) {
        Thread.sleep(process_join_wait_time);
      }
    }
  }

  /**
   * Polls the most recently started node once per second, for at most ten
   * minutes, until it reports as many active nodes as have been started. If
   * that never happens the condition is written to the output file and the
   * JVM is ended.
   */
  private void waitAllJoin() throws InterruptedException, IOException {
    final long max_wait_time = 10 * 60 * 1000;
    final long start_time = System.currentTimeMillis();
    boolean all_processes_started = false;
    while ((System.currentTimeMillis() - start_time) < max_wait_time) {
      try {
        LightWeightNameNode lastStarted = nnList.get(nnList.size() - 1);
        int known = lastStarted.getActiveNameNodes().size();
        if (known == nnList.size()) {
          all_processes_started = true;
          break;
        }
        LOG.info(
            "Experiment. The last process does not have complete list of processes. Got " +
                known + " expecting " + nnList.size());
      } catch (NullPointerException e) {
        // NOTE(review): presumably the member list is not available yet right
        // after start-up; the query is simply retried on the next round.
        LOG.error("Null pointer error in join");
      }
      // Sleep on every unsuccessful round, including after an error, so that a
      // failing query does not turn this loop into a busy spin.
      Thread.sleep(1000);
    }
    if (!all_processes_started) {
      writeMessageToFile(
          "Waiting for all processes to join is taking too long ...");
      System.exit(-1);
    }
  }

  /**
   * Samples the time period of a randomly chosen node once per second. The
   * system counts as stable as soon as the sampled value has not changed for
   * more than {@code consider_stable_after} ms; that value is stored in
   * {@code stable_time_period}. If this does not happen within
   * {@code max_stabilization_wait_time} ms the JVM is ended.
   */
  private void waitForTheSystemToStabilize()
      throws IOException, InterruptedException {
    final long start_time = System.currentTimeMillis();
    long last_time_period = -1;
    // Start the "unchanged since" clock at the beginning of the wait, so that
    // a first sample equal to the initial -1 is not treated as unchanged since
    // the epoch.
    long last_time_period_change_time = start_time;
    boolean system_stable = false;
    Random rand = new Random(System.currentTimeMillis());
    while ((System.currentTimeMillis() - start_time) <
        max_stabilization_wait_time) {
      long new_time_period =
          nnList.get(rand.nextInt(nnList.size())).getLeTimePeriod();
      if (last_time_period != new_time_period) {
        last_time_period = new_time_period;
        last_time_period_change_time = System.currentTimeMillis();
      } else if ((System.currentTimeMillis() - last_time_period_change_time) >
          consider_stable_after) {
        writeMessageToFile("After join the system stabilized in " +
            (System.currentTimeMillis() - start_time) +
            " ms. Time period is " + last_time_period);
        stable_time_period = last_time_period;
        system_stable = true;
        break;
      }
      Thread.sleep(1000);
      LOG.info(
          "Experiment. System has not yet stabilized. TP " + last_time_period +
              " since " +
              (System.currentTimeMillis() - last_time_period_change_time));
    }
    if (!system_stable) {
      writeMessageToFile("The system did not stabilize ... ");
      System.exit(-1);
    }
  }

  /**
   * Stops {@code number_of_leaders_to_kill} leaders one after another.
   *
   * <p>Each round waits until a node reports itself as leader whose id differs
   * from the previously stopped leader, records the elapsed time since that
   * leader was stopped as a fail-over sample, starts one replacement node and
   * then stops the new leader. The loop ends right after the last leader has
   * been stopped, so the fail-over that follows the last kill is not measured.
   *
   * <p>If no suitable leader shows up within five minutes the JVM is ended.
   * Any exception is logged and written to the output file; the method then
   * returns normally so that results collected so far are still reported.
   */
  private void killLeaders() {
    try {
      // Start of the current wait for a leader: the beginning of this phase
      // first, afterwards the moment the previous leader was fully stopped.
      long waitStartTime = System.currentTimeMillis();
      LightWeightNameNode leader_killed = null;
      int leadersKilled = 0;
      LOG.info("Experiment. going to start killing nodes");
      // NOTE(review): this loop polls without sleeping; presumably that keeps
      // the measured fail-over time precise, so it is left as it was.
      while (leadersKilled < number_of_leaders_to_kill) {
        LightWeightNameNode current_leader = getCurrentLeader();
        // A leader must actually exist before it can be stopped; after the
        // first kill it must also be a different node than the one stopped.
        boolean killableLeader = current_leader != null &&
            (leader_killed == null || current_leader.getLeCurrentId() !=
                leader_killed.getLeCurrentId());
        if (killableLeader) {
          if (leader_killed != null) {
            long failOverTime = System.currentTimeMillis() - waitStartTime;
            writeMessageToFile("New Leader Elected. Old Leader Id " +
                leader_killed.getLeCurrentId() + " new Leader Id " +
                current_leader.getLeCurrentId() + " New leader elected in " +
                (failOverTime));
            stats.addValue(failOverTime);
          }
          LOG.info("Experiment. going to start a new process");
          startAProcess();
          LOG.info("Experiment. new process started");
          writeMessageToFile("Experiment. Stopping the leader process ... Id " +
              current_leader.getLeCurrentId());
          long killstarttime = System.currentTimeMillis();
          current_leader.stop();
          while (!current_leader.getLeaderElectionInstance().isStopped()) {
            Thread.sleep(1);
          }
          waitStartTime = System.currentTimeMillis();
          LOG.info("Experiment. Stopped the leader process in " +
              (waitStartTime - killstarttime));
          leader_killed = current_leader;
          leadersKilled++;
        }
        if ((System.currentTimeMillis() - waitStartTime) >
            MAX_WAIT_FOR_FAIL_OVER) {
          writeMessageToFile("Taking very long to elect a new leader ...");
          System.exit(-1);
        }
      }
    } catch (Exception e) {
      if (e instanceof InterruptedException) {
        // Keep the interruption visible to the code that runs afterwards.
        Thread.currentThread().interrupt();
      }
      // Log the stack trace first so it is not lost if the file write fails.
      LOG.error("Exception while killing leaders", e);
      try {
        writeMessageToFile(
            "Got an exception that is not properly handled " + e);
      } catch (IOException ex) {
        LOG.error("Could not write to " + output_file_path, ex);
      }
    }
  }

  /** Appends a separator, the raw arguments and a rule to the output file. */
  private void writeStartMessages(String[] argv) throws IOException {
    writeMessageToFile(
        "\n\n==========================================================================");
    writeMessageToFile("Params " + Arrays.toString(argv));
    writeMessageToFile(
        "--------------------------------------------------------------------------");
  }

  /**
   * Appends the summary line (process count, stable time period, min, max,
   * mean, variance, standard deviation and standard error of the fail-over
   * times) followed by the individual data points.
   */
  private void writeResults() throws IOException {
    double standardError =
        stats.getStandardDeviation() / Math.sqrt(stats.getN());
    writeMessageToFile(
        "Experiment Finished Sucessfully. Data " + max_processes + ", " +
            stable_time_period + ", " + stats.getMin() + ", " +
            stats.getMax() + ", " + stats.getMean() + ", " +
            stats.getVariance() + ", " + stats.getStandardDeviation() + ", " +
            standardError);
    writeMessageToFile("DataPoints: " + stable_time_period + " " +
        Arrays.toString(stats.getValues()));
  }

  /**
   * Scans all started nodes, newest first, for nodes reporting themselves as
   * leader. More than one such node is written to the output file and ends the
   * JVM.
   *
   * @return the single node reporting itself as leader, or {@code null} when
   *     no node does
   */
  private LightWeightNameNode getCurrentLeader() throws IOException {
    int leaderCount = 0;
    LightWeightNameNode leader = null;
    for (int i = nnList.size() - 1; i >= 0; i--) {
      LightWeightNameNode candidate = nnList.get(i);
      if (candidate.isLeader()) {
        leaderCount++;
        leader = candidate;
      }
    }
    if (leaderCount > 1) {
      writeMessageToFile("Wrong number of leaders. Found " + leaderCount);
      System.exit(-1);
    }
    return leader;
  }

  /**
   * Creates one node and adds it to the node list, making up to 100 attempts
   * with a random pause of less than five seconds between attempts. When all
   * attempts fail the failure is written to the output file and the JVM is
   * ended.
   *
   * @throws IOException if the output file cannot be written
   * @throws InterruptedException if the pause between attempts is interrupted
   */
  public void startAProcess() throws IOException, InterruptedException {
    final int maxTries = 100;
    Random rand = new Random(System.currentTimeMillis());
    for (int triesLeft = maxTries - 1; triesLeft >= 0; triesLeft--) {
      try {
        LightWeightNameNode nn =
            new LightWeightNameNode(new HdfsLeDescriptorFactory(), time_period,
                missed_hb_threshold, time_period_increment, HTTP_ADDRESS,
                RPC_ADDRESS);
        nnList.add(nn);
        return;
      } catch (Throwable e) {
        // NOTE(review): Throwable is caught as before, so Errors raised while
        // creating a node are retried as well - confirm this is intended.
        LOG.warn("Could not create a process. Retrying (tries left " +
            triesLeft + ")... Exception was " + e.getMessage(), e);
        if (triesLeft > 0) {
          Thread.sleep(rand.nextInt(5000));
        }
      }
    }
    writeMessageToFile("Unable to start a process. Experiment failed ...");
    System.exit(-1);
  }

  /**
   * Logs the message at INFO level and appends it as one line to
   * {@code output_file_path}.
   *
   * @param message text to log and append
   * @throws IOException if the file cannot be opened or the write fails
   */
  public void writeMessageToFile(String message) throws IOException {
    LOG.info(message);
    PrintWriter out = new PrintWriter(
        new BufferedWriter(new FileWriter(output_file_path, true)));
    try {
      out.println(message);
      // PrintWriter never throws on write errors; checkError() flushes and
      // reports them so that lost result lines do not go unnoticed.
      if (out.checkError()) {
        throw new IOException("Could not write to " + output_file_path);
      }
    } finally {
      out.close();
    }
  }
}