package org.fi;

import java.io.*;
import java.nio.channels.*;
import java.util.*;

public class Driver {

  public static boolean TESTING = false;
  //public static boolean TESTING = true;

  // ########################################################################
  // ########################################################################
  // ##                                                                    ##
  // ##             E X P   P A R A M E T E R S                            ##
  // ##                                                                    ##
  // ########################################################################
  // ########################################################################

  public static int NUM_OF_CASS_NODES = 4;
  public static int BREAK_EXP_NUMBER = Parameters.BREAK_EXP_NUMBER;
  //public static int BREAK_EXP_NUMBER = 2;
  public static final int MAX_FSN = Parameters.MAX_FSN;

  public static boolean
    // enableFailure = false,
    enableFailure = Parameters.enableFailure,
    //enableOptimizer = true,
    enableOptimizer = false,
    enableCoverage = Parameters.enableCoverage,
    // enableCoverage = false,
    debug = Parameters.debug,
    // debug = false,
    // enableFrog = true,
    enableFrog = false,
    junk; // terminator, so the commented-out alternatives above can keep their trailing commas

  public static final String FILTER_ID = Parameters.filter;
  public static final String CONSISTENCY = Parameters.clevel;

  // ########################################################################
  // ########################################################################

  // some locations
  public static final String TMPFI = "/tmp/fi/";

  // dirs
  public static final String CASS_STORAGE_DIR = TMPFI + "cassandra/";
  public static final String FAIL_HISTORY_DIR = TMPFI + "failHistory/";
  public static final String RPC_FILES_DIR = TMPFI + "rpcFiles/";
  public static final String FLAGS_FAILURE_DIR = TMPFI + "flagsFailure/";
  public static final String EXP_RESULT_DIR = TMPFI + "expResult/";
  public static final String SOCKET_HISTORY_DIR = TMPFI + "socketHistory/";
  public static final String CASS_LOGS_DIR = TMPFI + "logs/";
  public static final String CASS_PIDS_DIR = TMPFI + "pids/";
  //jinsu for net stuff
  public static final String IP_HISTORY_DIR = TMPFI + "ipHistory/";
  public static final String TOKENS_DIR = TMPFI + "tokens/";
  public static final String EXP_PROP_DIR = TMPFI + "expProp/";
  public static final String COVERAGE_COMPLETE_DIR = TMPFI + "coverageComplete/";
  public static final String COVERAGE_STATIC_DIR = TMPFI + "coverageStatic/";

  // files and flags
  public static final String FROG_OUTPUT_FILE = TMPFI + "frogOutput.txt";
  public static final String RESET_FROG_FLAG = TMPFI + "resetFrogFlag";
  public static final String ENABLE_FAILURE_FLAG = TMPFI + "enableFailureFlag";
  public static final String ENABLE_FROG_FLAG = TMPFI + "enableFrogFlag";
  public static final String CLIENT_OPTIMIZE_FLAG = TMPFI + "clientOptimizeFlag";
  public static final String ENABLE_COVERAGE_FLAG = TMPFI + "enableCoverageFlag";
  public static final String NODES_CONNECTED_FLAG = TMPFI + "nodesConnectedFlag";
  //public static final String EXPERIMENT_RUN_FLAG = TMPFI + "experimentRunningblah";
  public static final String EXPERIMENT_RUN_FLAG = TMPFI + "experimentRunning";
  public static final String NODE_REBOOTING_FLAG = TMPFI + "nodesRebootingFlag";
  //flag for repair workload
  public static final String READ_REPAIR_FLAG = TMPFI + "readRepairFlag";
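  // Each *_FLAG above is simple file-based IPC: a feature is considered "on"
  // iff the flag file exists (see enableCoverage()/disableCoverage() below,
  // which just create or delete the file). A minimal sketch of how a
  // cooperating process could poll such a flag -- illustrative only, the
  // actual failure manager lives outside this class:
  //
  //   boolean failureOn = new java.io.File(Driver.ENABLE_FAILURE_FLAG).exists();
  //   if (failureOn) { /* inject the scheduled failure */ }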
  // vars
  private static Utility u;
  private static Cass cass;
  private static int expNum = 1;
  private static int wipedOutNum = 1; // # of wiped out experiments

  public Driver() {
    u = new Utility();
    u.setupPrintStream("/tmp/workloadOut.txt");
    // don't forget to run make kill
    printReminder();
    // connect to Cassandra so the workload can use this
    cass = new Cass(this);
  }

  // *******************************************
  // 1. prepare everything
  // 2. decide if we want to enable failure or not
  // 3. run the recursive fsn ...
  // 4. final message or check
  // *******************************************
  public void run() {
    // a big preparation stage before we
    // run a set of long experiments
    setUpBeforeEverything();

    // begin recursive fsn
    recursiveFsn(1);

    // final message
    setUpAfterEverything();
  }

  // *******************************************
  // general setup .. this takes a while
  // *******************************************
  public void setUpBeforeEverything() {

    createAllDirectories();

    // 0. disable failure manager, coverage, optimizer, frog
    disableCoverage();
    disableFailureManager();
    disableClientOptimizer();
    disableFrog();

    // 1. make sure the current working directory is correct
    printCwd();

    // ------------------------------ clean up stuff

    // 2. kill any Cassandra processes (this does not always work,
    //    which is why we call "make kill" before running this app)
    killCass();

    // 3. clear all
    rmPids();
    rmLogs();
    rmImages();
    rmExpResult();     // .. remove previous experiment results
    rmSocketHistory(); // .. remove previous socket history
    rmRpcFiles();      // rm all rpc files
    rmCoverageFiles();
    rmFmStat();
    rmFlags();
    //JINSU
    rmIpHistory();
    rmTokens();
    rmExpProp();

    // 4. clear any failure flags
    clearAllFlagsFailure();

    // 5. clear all fail history (the directory that contains
    //    the failure hash files)
    clearAllFailHistory();

    // ------------------------------ setup stuff

    // 6. write experiment configurations/properties to a file
    recordExpProp();

    // 7. start the failure manager
    startFailureManager();

    // 8. start the Cassandra servers
    startCass();

    // 9. record max fsn
    recordMaxFsn();
  }

  // *******************************************
  // final
  // *******************************************
  public void setUpAfterEverything() {
    printAllExperimentsFinish();
  }

  // ########################################################################
  // ########################################################################
  // ##                                                                    ##
  // ##                  C O R E   L O G I C                               ##
  // ##                                                                    ##
  // ########################################################################
  // ########################################################################

  // *******************************************
  // the recursive logic of multiple failures ... not documented right now
  // - unlockFsn: we want to unlock the failure sequence number for this
  //   fsn so we can insert failures
  // - clearFailHistoryOfPostFsns: if we are at failures A-C-?
  //   we want to exercise A-C-D, A-C-E ... but D and E at fsn-3 might
  //   have been exercised before (e.g. A-B-D, A-B-E). so we want to
  //   clear all post fsns so that we can try D and E again at fsn-3.
  // - if (fsn != MAX_FSN) recursiveFsn(fsn + 1)
  //   we only want to run the workload if we get to the final fsn .. because
  //   we are indeed trying MAX_FSN failures.
  // - prepareToRunWorkload .. see there
  // - currentFhf and lastFhf: check whether, at fsn-X, we see
  //   new failures or not ... if not, break .. then we go back to fsn-(X-1)
  //   and recurse from there
  // - lock pre-fsns ... say we are at failures A-B-C; basically
  //   what we want to say is that in the next experiment we want to
  //   exercise A-B-D .. so it means we must lock fsn-1 and fsn-2
  //   such that they will still fail A and B
  // *******************************************
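  // A worked example of the recursion (hedged; assumes MAX_FSN = 2 and that
  // the failure manager exposes candidate failures A, B at fsn-1 and C, D at
  // fsn-2). The loop below would exercise the sequences
  //   A-C, A-D   (fsn-1 locked to A while fsn-2 iterates)
  //   B-C, B-D   (fsn-1 moves to B; fsn-2's history is cleared and re-iterated)
  // i.e. a depth-first walk over all failure sequences of length MAX_FSN.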
  public void recursiveFsn(int fsn) {

    String lastFhf = "LAST";

    // keep going until there is no new failure for this
    // failure sequence number
    while (true) {

      // see if we can break
      if (expNum > BREAK_EXP_NUMBER)
        break;

      //u.print("\n\n------------------------ fsn-%d \n\n", fsn));

      // ...
      unlockFsn(fsn);
      clearFailHistoryOfPostFsns(fsn);

      // ...
      if (fsn != MAX_FSN) {
        recursiveFsn(fsn + 1);
      } else {
        prepareToRunWorkload();
      }

      // should we move on?
      lastFhf = compareAndGetNewFhf(lastFhf, fsn);
      if (lastFhf == null) {
        // haryadi: when TESTING, keep going forever
        if (!TESTING) {
          break;
        }
      }

      // lock pre fsns ..
      lockPreFsns(fsn);
    }
  }

  // *******************************************
  // if current is a new one, returns current
  // else, returns null
  // *******************************************
  public String compareAndGetNewFhf(String lastFhf, int fsn) {

    String currentFhf = getLatestFhf(fsn);

    // no failure condition
    if (currentFhf == null)
      return null;

    // last failure is the same as current failure.
    // (lastFhf can be null when TESTING keeps the loop going; guard against NPE)
    // u.print(String.format("- Comparing fsn-%d last=%s current=%s \n",
    //                       fsn, currentFhf, lastFhf));
    if (lastFhf != null && lastFhf.equals(currentFhf))
      return null;

    // return the new fhf
    return currentFhf;
  }

  // *******************************************
  public void prepareToRunWorkload() {

    // create experiment
    Experiment exp = new Experiment(this, expNum);
    exp.printBegin();

    // prepare to run workload
    setupBeforeEachWorkload(exp);

    // run the actual workload
    runWorkload(exp);

    // setup after
    setupAfterEachWorkload(exp);
  }

  // *******************************************
  public void setupBeforeEachWorkload(Experiment exp) {
    u.print("- Prepare before each workload ... \n");
    recordCurrentExpNumber(exp.getExpNum());
    clearAllInjectedFsn();
    clearAllBadDiskFlags();
    u.print("- check dead nodes (1) ...\n");
    checkDeadNodes();
    restartDeadDataNodes();
    u.print("- check dead nodes (2) ...\n");
    checkDeadNodes();
    resetFrog(); // must be after restarting dead datanodes !!
    rmAllBlocks();
  }
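  // A summary of the teardown decision in setupAfterEachWorkload below
  // (restating the branches, not adding logic):
  //  - the experiment did not reach MAX_FSN: its dir content is removed
  //    (unless debug) and the experiment is wiped out
  //  - it reached MAX_FSN but did not fail: its dir content is removed
  //    (unless debug)
  //  - it reached MAX_FSN and failed: everything is kept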
  // *******************************************
  public void setupAfterEachWorkload(Experiment exp) {

    // done
    exp.printEnd();

    // check whether the experiment saw the full max_fsn or not.
    // if not, then we have already tested this experiment
    // before with a smaller max_fsn, so we'll just
    // delete this experiment folder and not
    // increment the experiment number.
    // it doesn't hurt to print the fail history and
    // check the failed experiment even though the experiment
    // is wiped out.

    // now check if this experiment fails
    exp.checkFailExperiment();
    exp.printFailHistorySummary();
    if (debug) {
      exp.printFailHistory();
    }

    // this experiment does not reach the max fsn
    if (!exp.reachMaxFsn()) {
      // must be after checking max fsn ...
      // just delete the content of the experiment dir
      if (!debug) {
        exp.rmExpDirContent();
      }
      exp.wipeOutThisExperiment();
    } else {
      // delete the dir content if it did not fail
      if (!exp.isFail() && !debug) {
        exp.rmExpDirContent();
      }
    }

    rmSocketHistory();

    if (BREAK_EXP_NUMBER != exp.getExpNum()) {
      if (TESTING) {
        //waiting until all the lagging message queries are completed.
        //u.print("waiting some time til other nodes process the messages already delivered to them...\n");
        //u.sleep(5000);
        killNode("node1"); //just kills the node using the kill command
        // jinsu : then I need to remove the data because we want the node to
        // start in a clean state. sometimes, if data isn't deleted before the
        // node is restarted, its commitLog gets corrupted.
        //u.deleteDir(CASS_STORAGE_DIR + "node1");
        //killNode("node2");
        //u.deleteDir(CASS_STORAGE_DIR + "node2");
        u.sleep(60000);
        u.print("node1 has been dead and we waited 60 seconds!\n");
        System.exit(0);
        // note: unreachable while the exit(0) above is active
        checkDeadNodes();
        restartDeadDataNodes();
      } else {
        //For the insert workload I don't need to do anything ...
        //rmPids();
        //rmLogs();
        rmImages();
        rmRpcFiles();
        //rmIpHistory();
      }
      //u.mkDir(CASS_STORAGE_DIR);
    }

    //u.println("checking DeadNodes after workload");
    //checkDeadNodes();

    // increment the experiment number
    incrementExpNum();

    // also remove all ports, otherwise the directory gets too big !!!
    // (it could contain thousands of files)
  }

  // *******************************************
  // check the algorithm below
  // *******************************************
  public void runWorkload(Experiment exp) {
    // the experiment runs here ...
    // here, we can write whatever experiments we want.
    // so now, I've created the workload client write class
    // which will run the client write workloads

    //remove this later to enable the insert workload.
    //ClientInsertWorkload ciw = new ClientInsertWorkload(this, exp);
    //ciw.run();

    selectWorkloadToRun(FILTER_ID, exp);

    //ClientReadRepairWorkload crrw = new ClientReadRepairWorkload(this, exp);
    //crrw.run();
  }

  //JINSU : for configurable experiments
  public void selectWorkloadToRun(String filter, Experiment exp) {
    String delimiter = "\\d";
    String[] temp = filter.split(delimiter);
    String experiment_name = temp[0];
    //System.out.println("######## experiment name = " + experiment_name);
    if (experiment_name.equalsIgnoreCase("readrepair")) {
      ClientReadRepairWorkload crrw = new ClientReadRepairWorkload(this, exp);
      crrw.run();
    } else if (experiment_name.equalsIgnoreCase("insertion")) {
      ClientInsertWorkload ciw = new ClientInsertWorkload(this, exp);
      ciw.run();
    } else {
      //default case is the read repair workload
      ClientReadRepairWorkload crrw = new ClientReadRepairWorkload(this, exp);
      crrw.run();
    }
  }
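  // Example of the split above (hedged; actual FILTER_ID values come from
  // Parameters.filter): splitting on the digit regex "\\d" means a filter
  // such as "readrepair1" yields temp = { "readrepair" }, so the leading
  // name picks the workload and any trailing digits are ignored here.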
  // *******************************************
  // unlock this fsn so the hash for this fsn is free;
  // unlock it and also delete the fsn hash file
  // *******************************************
  public void unlockFsn(int fsn) {
    u.print("- Unlocking fsn-" + fsn + "\n");
    File f = getFsnLockFile(fsn);
    u.deleteFile(f);
    String hashPrefix = "hash-for-fsn-" + fsn;
    u.deleteDirContent(FLAGS_FAILURE_DIR, hashPrefix);
  }

  // *******************************************
  // this is optional, for when we want to repeat a failure
  // but in a different zone ... try to disable this
  // and see what will happen
  // *******************************************
  public void clearFailHistoryOfPostFsns(int currentFsn) {
    u.print(String.format("- Clearing fail history of post fsns %d - %d \n",
                          currentFsn + 1, MAX_FSN));
    for (int i = currentFsn + 1; i <= MAX_FSN; i++) {
      clearFailHistory(i);
    }
  }

  // *******************************************
  // .../failHistory/fsn-1/
  // if we clear the fail history, we must also clear the latest history
  // of this fsn:
  // ../failHistory/latest-for-fsn-1
  // *******************************************
  public void clearFailHistory(int fsn) {
    u.print("- Clearing fail history of fsn-" + fsn + "\n");
    String path = String.format("%s/fsn-%d", FAIL_HISTORY_DIR, fsn);
    if (!u.deleteDir(path)) {
      u.ERROR("Can't delete " + path);
    }
    File f = getLatestHistoryFile(fsn);
    u.deleteFile(f);
  }

  // *******************************************
  // .../flagsFailure/injected-fsn-*
  // *******************************************
  public void clearAllInjectedFsn() {
    u.print("- Clearing all injected fsns ...\n");
    u.deleteDirContent(FLAGS_FAILURE_DIR, "injected-fsn-");
  }

  // *******************************************
  // .../flagsFailure/BadDisk_..
  // *******************************************
  public void clearAllBadDiskFlags() {
    u.print("- Clearing all bad-disk flags ...\n");
    u.deleteDirContent(FLAGS_FAILURE_DIR, "BadDisk");
  }

  // *******************************************
  // if the pre fsns are not locked yet, we want to
  // lock them
  // *******************************************
  public void lockPreFsns(int currentFsn) {
    u.print(String.format("- Locking pre fsns %d - %d \n", 1, currentFsn - 1));
    for (int i = 1; i < currentFsn; i++) {
      lockFsn(i);
    }
  }

  // *******************************************
  // lockFsn consists of adding the lock flag
  // and also adding the latest failure hash id
  // *******************************************
  public void lockFsn(int fsn) {
    u.print("- Locking fsn-" + fsn + "\n");
    File f = getFsnLockFile(fsn);
    u.createNewFile(f);
    String latestFhf = getLatestFhf(fsn);
    if (latestFhf == null) {
      u.FATAL("lockFsn logic error");
    }
    f = getFsnAndHashFile(fsn, latestFhf);
    u.createNewFile(f);
  }

  // *******************************************
  private void recordCurrentExpNumber(int expNum) {
    String path = FLAGS_FAILURE_DIR + "/currentExpNumber";
    String tmp = String.format("%d", expNum);
    u.stringToFileContent(tmp, path);
  }
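  // For reference, the on-disk layout these methods maintain looks roughly
  // like this (a hedged sketch built from the format strings below; hash
  // values are illustrative):
  //
  //   /tmp/fi/failHistory/fsn-1/h<hash>.txt               per-fsn failure hashes
  //   /tmp/fi/failHistory/latest-for-fsn-1                most recent hash for fsn-1
  //   /tmp/fi/flagsFailure/locked-fsn-1                   fsn-1 is locked
  //   /tmp/fi/flagsFailure/hash-for-fsn-1-is-h<hash>.txt  the hash fsn-1 is locked to
  //   /tmp/fi/flagsFailure/currentExpNumber               current experiment number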
  // ########################################################################
  // ########################################################################
  // ##                                                                    ##
  // ##                    U T I L I T Y                                   ##
  // ##                                                                    ##
  // ########################################################################
  // ########################################################################

  // *******************************************
  private static void incrementExpNum() {
    expNum++;
  }

  // *******************************************
  public static int getWipedOutNum() {
    return wipedOutNum;
  }

  // *******************************************
  public static void incrementWipedOutNum() {
    wipedOutNum++;
  }

  // ********************************************
  // filename: locked-fsn-#
  // ********************************************
  private File getFsnLockFile(int fsn) {
    String path = String.format("%s/locked-fsn-%d", FLAGS_FAILURE_DIR, fsn);
    File f = new File(path);
    return f;
  }

  // ********************************************
  // return the latest history file for this fsn
  // ********************************************
  private File getLatestHistoryFile(int fsn) {
    String path = String.format("%s/latest-for-fsn-%d", FAIL_HISTORY_DIR, fsn);
    File f = new File(path);
    return f;
  }

  // ***************************************************
  public String getLatestFhf(int fsn) {
    File f = getLatestHistoryFile(fsn);
    if (!f.exists()) {
      return null;
    }
    String tmp = u.fileContentToString(f);
    if (tmp == null)
      return tmp;
    tmp = tmp.replaceAll("\n", "");
    return tmp;
  }

  // ********************************************
  // filename: hash-for-fsn-%d-is-
  // ********************************************
  private File getFsnAndHashFile(int fsn, String hash) {
    String path = String.format("%s/hash-for-fsn-%d-is-h%s.txt",
                                FLAGS_FAILURE_DIR, fsn, hash);
    File f = new File(path);
    return f;
  }

  // ********************************************
  public void printCwd() {
    String curDir = System.getProperty("user.dir");
    u.println("- Current directory is " + curDir);
  }

  // *******************************************
  public void rmCoverageFiles() {
    u.print("- Removing coverage files ...\n");
    if (!u.deleteDirContent(COVERAGE_COMPLETE_DIR)) {
      u.ERROR("Can't delete " + COVERAGE_COMPLETE_DIR);
    }
    if (!u.deleteDirContent(COVERAGE_STATIC_DIR)) {
      u.ERROR("Can't delete " + COVERAGE_STATIC_DIR);
    }
  }

  // *******************************************
  // remove all files recursively starting from CASS_STORAGE_DIR
  // *******************************************
  public void rmImages() {
    u.print("- Removing images ...\n");
    if (!u.deleteDir(CASS_STORAGE_DIR)) {
      u.ERROR("Can't delete " + CASS_STORAGE_DIR);
    }
  }

  // *******************************************
  // remove all files recursively
  // *******************************************
  public void rmExpResult() {
    u.print("- Removing previous experiment results ...\n");
    if (!u.deleteDirContent(EXP_RESULT_DIR)) {
      u.ERROR("Can't delete " + EXP_RESULT_DIR);
    }
  }

  // *******************************************
  // remove socket history
  // *******************************************
  public void rmSocketHistory() {
    u.print("- Removing socket history ...\n");
    if (!u.deleteDirContent(SOCKET_HISTORY_DIR)) {
      u.ERROR("Can't delete " + SOCKET_HISTORY_DIR);
    }
  }

  // *******************************************
  // remove rpc files
  // *******************************************
  public void rmRpcFiles() {
    u.print("- Removing RPC files ...\n");
    if (!u.deleteDirContent(RPC_FILES_DIR)) {
      u.ERROR("Can't delete " + RPC_FILES_DIR);
    }
  }

  // *******************************************
  // clear failure flags
  // *******************************************
  public void clearAllFlagsFailure() {
    u.print("- Clearing all failure flags ...\n");
    if (!u.deleteDirContent(FLAGS_FAILURE_DIR)) {
      u.ERROR("Can't delete " + FLAGS_FAILURE_DIR);
    }
  }

  // *******************************************
  // reset frog.
  // this is a stupid but fast way to reset frog:
  // just create a new flag in /tmp
  // *******************************************
  public void resetFrog() {
    u.print("- Resetting frog ...\n");
    File f = new File(RESET_FROG_FLAG);
    try {
      f.createNewFile();
    } catch (Exception e) {
      u.ERROR("can't create " + f.getAbsolutePath());
    }
  }

  // *******************************************
  // remove all files inside the FAIL_HISTORY_DIR dir
  // *******************************************
  public void clearAllFailHistory() {
    u.print("- Removing all fail history ...\n");
    if (!u.deleteDirContent(FAIL_HISTORY_DIR)) {
      u.ERROR("Can't delete " + FAIL_HISTORY_DIR);
    }
  }
  // *******************************************
  // rm all log files ..
  // *******************************************
  public void rmLogs() {
    u.print("- Removing logs ...\n");
    if (!u.deleteDirContent(CASS_LOGS_DIR)) {
      u.ERROR("Can't delete " + CASS_LOGS_DIR);
    }
  }

  // *******************************************
  // rm all pid files ..
  // *******************************************
  public void rmPids() {
    u.print("- Removing pids ...\n");
    if (!u.deleteDirContent(CASS_PIDS_DIR)) {
      u.ERROR("Can't delete " + CASS_PIDS_DIR);
    }
  }

  // *******************************************
  // rm all ipHistory files ..
  // *******************************************
  public void rmIpHistory() {
    u.print("- Removing ipHistory ...\n");
    if (!u.deleteDirContent(IP_HISTORY_DIR)) {
      u.ERROR("Can't delete " + IP_HISTORY_DIR);
    }
  }

  // *******************************************
  // rm all token files ..
  // *******************************************
  public void rmTokens() {
    u.print("- Removing tokens ...\n");
    if (!u.deleteDirContent(TOKENS_DIR)) {
      u.ERROR("Can't delete " + TOKENS_DIR);
    }
  }

  // *******************************************
  public void rmExpProp() {
    u.print("- Removing ExpProps ...\n");
    if (!u.deleteDirContent(EXP_PROP_DIR)) {
      u.ERROR("Can't delete " + EXP_PROP_DIR);
    }
  }

  // *******************************************
  // start the failure manager
  // *******************************************
  public void startFailureManager() {
    u.print("- Starting Failure Manager ...\n");
    String cmdout = u.runCommand("bin/cfi");
    u.print(cmdout);
    u.print("\n\n");
  }

  // *******************************************
  // record max fsn, so the fm knows the max fsn
  // *******************************************
  public void recordMaxFsn() {
    String path = FLAGS_FAILURE_DIR + "/maxFsn";
    String tmp = String.format("%d", MAX_FSN);
    u.stringToFileContent(tmp, path);
  }

  // *******************************************
  // enable the failure manager via the fmadmin command;
  // see my bin/hadoop to find what this is
  // *******************************************
  public static void enableFailureManager() {
    if (!Driver.enableFailure)
      return;
    u.print("- Enabling Failure Manager ...\n");
    u.createNewFile(ENABLE_FAILURE_FLAG);
  }

  // *******************************************
  public static void enableFrog() {
    if (!Driver.enableFrog)
      return;
    u.print("- Enabling Frog ...\n");
    u.createNewFile(ENABLE_FROG_FLAG);
  }

  // *******************************************
  public static void disableFrog() {
    u.print("- Disabling Frog ...\n");
    u.deleteFile(ENABLE_FROG_FLAG);
  }

  // *******************************************
  public static void enableCoverage() {
    if (!Driver.enableCoverage)
      return;
    u.print("- Enabling Coverage ...\n");
    u.createNewFile(ENABLE_COVERAGE_FLAG);
  }

  // *******************************************
  public static void disableCoverage() {
    u.print("- Disabling Coverage ...\n");
    u.deleteFile(ENABLE_COVERAGE_FLAG);
  }

  // *******************************************
  public static void enableClientOptimizer() {
    if (!Driver.enableOptimizer)
      return;
    u.print("- Optimizing FM Client ...\n");
    u.createNewFile(CLIENT_OPTIMIZE_FLAG);
  }

  // *******************************************
  public static void disableClientOptimizer() {
    u.print("- Unoptimizing FM Client ...\n");
    u.deleteFile(CLIENT_OPTIMIZE_FLAG);
  }

  // *******************************************
  public static void disableFailureManager() {
    u.print("- Disabling Failure Manager ...\n");
    u.deleteFile(ENABLE_FAILURE_FLAG);
  }

  // *******************************************
  public void rmFmStat() {
    u.deleteFile("/tmp/fmStat.txt");
  }

  // *******************************************
  public void rmFlags() {
    u.print("- Deleting flags ...\n");
    u.deleteFile(Driver.NODES_CONNECTED_FLAG);
    //u.deleteFile(Driver.EXPERIMENT_RUN_FLAG);
  }

  // *******************************************
  // just call start-cass
  // *******************************************
  public void startCass() {
    u.print("- Starting Cassandra ...\n");
    u.print(new Date(System.currentTimeMillis()) + " - sC(1) " + u.diff() + "\n");
    String cmdout = u.runCommand("bin/allCnode " + NUM_OF_CASS_NODES);
    u.print(new Date(System.currentTimeMillis()) + " - sC(2) " + u.diff() + "\n");
    u.print(cmdout);
    u.print("\n\n");
    /*
    String cmdout = u.runCommand("bin/cassandra -p 0");
    u.print(cmdout);
    u.print("\n\n");
    for (int i = 1; i < Driver.NUM_OF_CASS_NODES; i++) {
      cmdout = u.runCommand("bin/cnode " + i);
      u.print(cmdout);
      u.print("\n\n");
    }
    */
    cass.assertConnection();
  }

  // *******************************************
  // kill all cass processes (does not always work)
  // *******************************************
  public void killCass() {
    u.print("- Killing Cassandra nodes ...\n");
    //jinsu change made
    cass.client = null;
    u.deleteFile(NODES_CONNECTED_FLAG);
    //u.MESSAGE(" Killing cassandra, FIXME \n");
    NodeProcess[] nps = getNodeProcesses();
    if (nps == null)
      return;
    for (int i = 0; i < nps.length; i++) {
      String cmd = String.format("kill -s KILL %5s", nps[i].getPid());
      u.print(String.format("  %s, %s \n", cmd, nps[i].getName()));
      String cmdout = u.runCommand(cmd);
    }
    u.print("\n\n");
  }

  // **********************************************
  // jinsu: kill one node using the kill command.
  // you should also clean up its data and pid file
  // **********************************************
  public void killNode(String nodeId) {
    NodeProcess[] nps = getNodeProcesses();
    String killPid = "";
    for (NodeProcess node : nps) {
      if (node.getName().equals(nodeId)) {
        killPid = u.getPidFromTmpPid(new File(node.getTmpPidFile()));
        String cmd = String.format("kill -s KILL %5s", killPid);
        String cmdOut = u.runCommand(cmd);
        u.print(cmdOut + "\n");
      }
    }
    if (nodeId.equals("node0")) {
      cass.client = null;
    }
    u.deleteFile(NODES_CONNECTED_FLAG);
  }

  // *******************************************
  // a somewhat stupid method to find out whether a datanode is dead:
  // just do ps -p pid .. then check whether the word java is in the output
  // *******************************************
  public void checkDeadNodes() {
    u.print("- Checking dead nodes ...\n");
    NodeProcess[] nps = getNodeProcesses();
    if (nps == null)
      return;
    for (int i = 0; i < nps.length; i++) {
      boolean isAlive = u.isPidAlive(nps[i].getPid());
      u.print(String.format("  %-5s %-15s ", nps[i].getPid(), nps[i].getName()));
      if (isAlive) {
        u.print("ok \n");
      } else {
        u.print("DEAD \n");
      }
    }
    u.print("\n\n");
  }

  // *******************************************
  // restart dead datanodes ...
  // go through each pid in /tmp/hadoop..pid
  // and find which pid is dead
  // FIXME
  // *******************************************
  public void restartDeadDataNodes() {
    u.print("- Restarting dead nodes ...\n");
    //u.MESSAGE("FIXME: restart dead datanodes \n");
    NodeProcess[] nps = getNodeProcesses();
    if (nps == null)
      return;
    for (int i = 0; i < nps.length; i++) {
      boolean isAlive = u.isPidAlive(nps[i].getPid());
      if (isAlive)
        continue; // continue if it's alive
      String s = String.format("  Restarting %-15s %s \n",
                               nps[i].getName(), nps[i].getPid());
      u.print(s);
      // before restarting, make sure we remove
      // everything that relates to this dead datanode,
      // such as the pid file and the log files.
      // first I need to remove the tmp pid file
      u.deleteFile(nps[i].getTmpPidFile());
      // then I need to remove the logs
      rmNodeLogFile(nps[i].getName());
      //JINSU
      //TODO: sometimes we need to clear the commitLogs or data in the dead
      // node because commitLogs get corrupted at times. we would need to call
      // u.deleteDirContent(CASS_STORAGE_DIR + nps[i].getName()), but it
      // shouldn't be called for the reboot workload.
      u.createNewFile(NODE_REBOOTING_FLAG);
      //u.stringToFileContent(nps[i].get);
      // let's restart the datanode
      restartNode(nps[i].getName());
      u.deleteFile(NODE_REBOOTING_FLAG);
      // okay, so we must wait until that datanode is registered
      u.print("  Waiting for registration ...\n");
      waitForNodeRegistration(nps[i].getName());
    }
    u.createNewFile(NODES_CONNECTED_FLAG);
    u.sleep(1000);
    u.print("\n\n");
  }

  // *******************************************
  // restart the datanode with dnId.
  // in my case I can do this by calling
  // e.g. "pdatanode -3" .. in thanh's case you
  // must set the conf folder properly
  // *******************************************
  public void restartNode(String nodeName) {
    // or, bin/hd.sh --config conf start pdatanode -1
    String nodeNum = nodeName.replace("node", "");
    if (nodeNum.equals("0")) {
      cass.client = null;
      //TODO: generalize this because later on, node0 might not be the only
      // node the client connects to.
      String cmd = String.format("bin/cassandra -p %s \n", nodeNum);
      String cmdout = u.runCommand(cmd);
    } else {
      String cmd = String.format("bin/cnode %s \n", nodeNum);
      String cmdout = u.runCommand(cmd);
    }
  }

  // *********************************************
  // a helper function to get the total number of
  // alive nodes for waitForNodeRegistration
  // *********************************************
  public int getNumAliveNodes(NodeProcess[] nps) {
    int num = 0;
    for (int i = 0; i < nps.length; i++) {
      boolean isAlive = u.isPidAlive(nps[i].getPid());
      if (isAlive)
        num++;
    }
    return num;
  }

  //JINSU: maybe more general than getNumAliveNodes
  public NodeProcess[] getLiveNodes(NodeProcess[] nps) {
    LinkedList<NodeProcess> liveNps = new LinkedList<NodeProcess>();
    for (int i = 0; i < nps.length; i++) {
      boolean isAlive = u.isPidAlive(nps[i].getPid());
      if (isAlive)
        liveNps.add(nps[i]);
    }
    return (NodeProcess[]) liveNps.toArray(new NodeProcess[liveNps.size()]);
  }
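  // A worked example of the registration check below (hedged; the exact log
  // text depends on the instrumented Cassandra build): for nodeName "node0"
  // we grep "cassReady" lines out of logs/node0.log and count which peer
  // addresses appear. the node's own entry is skipped, node0 is matched as
  // the literal "127.0.0.1", and peer i is matched as "127.0.0.1" + i
  // (e.g. node2 -> "127.0.0.12"). once contains == numAlive - 1, every
  // other live node has been seen and the wait loop ends.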
  // *******************************************
  // a stupid but working method to find out whether
  // a datanode has been registered or not.
  // we can detect successful registration by checking that
  // "is now part of the cluster." exists in the log file
  // (the current code greps for "cassReady" instead).
  // ex. waitForNodeRegistration("node0") => wait for node0 to see the other live nodes
  // *******************************************
  public void waitForNodeRegistration(String nodeName) {
    int nodeNum = Integer.parseInt(nodeName.replace("node", ""));
    //String cmd = String.format("grep -a %s %s", "cluster", CASS_LOGS_DIR + "node" + nodeNum + ".log");
    String cmd = String.format("grep -a %s %s", "cassReady",
                               CASS_LOGS_DIR + "node" + nodeNum + ".log");
    boolean connecting = true;
    String cmdOut = "";
    String pattern = "127.0.0.1";
    NodeProcess[] nps = getNodeProcesses();
    int numAlive = getNumAliveNodes(nps);

    while (connecting) {
      String logFile = getLogFileFromNodeName(nodeName);
      if (logFile != null) {
        cmdOut = u.runCommand(cmd);
        //u.print("cmdOut\n-- " + cmdOut);
        int contains = 0;
        for (int i = 0; i < NUM_OF_CASS_NODES; i++) {
          // i == nodeNum: no need to check myself; we are only checking
          // whether the other nodes are up
          if (i == nodeNum)
            continue;
          if (i == 0) {
            //special case for node0 ... 127.0.0.1
            if (cmdOut.contains(pattern)) {
              contains++;
            } else {
              break;
            }
          } else {
            if (cmdOut.contains(pattern + i)) {
              contains++;
            } else {
              break;
            }
          }
        }
        // numAlive - 1 because you don't check yourself
        if (contains == (numAlive - 1)) {
          connecting = false;
        }
      } else {
        u.print(nodeName + " log doesn't exist...\n");
      }
      u.print("- (Waking up) Waiting for node" + nodeNum + " to see all other nodes...\n");
      u.sleep(1000);
    }

    // checking that the token size is correct
    //TODO: put this token check back in.
    // taking it out temporarily for easy porting for now
    //Jin-Su Oct/6/2010
    //Jin-Su Nov/22/2010
    String tokenCmd = String.format("grep -a %s %s", "TokenSizeTest",
                                    TOKENS_DIR + "nodeToken" + nodeNum);
    while (true) {
      String tokenCmdOut = u.runCommand(tokenCmd);
      u.print(":::: checking token size ::::");
      u.print(tokenCmdOut + "\n");
      if (tokenCmdOut.contains(Integer.toString(numAlive))) {
        break;
      }
      u.print("- (Waiting for TokenSizeTest)...\n");
      u.sleep(1000);
    }
    u.print("- Node" + nodeNum + " is alive\n");
  }

  // *******************************************
  // rm all log files related to this datanode
  // *******************************************
  public void rmNodeLogFile(String nodeName) {
    String logFile = getLogFileFromNodeName(nodeName);
    if (logFile != null)
      u.deleteFile(CASS_LOGS_DIR, logFile);
    String outFile = getOutFileFromNodeName(nodeName);
    if (outFile != null)
      u.deleteFile(CASS_LOGS_DIR, outFile);
  }

  // *******************************************
  // get the log file for this datanode
  // *******************************************
  public String getLogFileFromNodeName(String nodeName) {
    File dir = new File(CASS_LOGS_DIR);
    String[] c = dir.list();
    for (int i = 0; i < c.length; i++) {
      if (c[i].contains(nodeName + ".log"))
        return c[i];
    }
    return null;
  }

  // *******************************************
  // get the output file for this datanode
  // *******************************************
  public String getOutFileFromNodeName(String nodeName) {
    File dir = new File(CASS_LOGS_DIR);
    String[] c = dir.list();
    for (int i = 0; i < c.length; i++) {
      if (c[i].contains(nodeName + ".out"))
        return c[i];
    }
    return null;
  }

  // *******************************************
  // get a list of node processes from tmp-pids;
  // see the NodeProcess class
  // *******************************************
  public NodeProcess[] getNodeProcesses() {
    LinkedList<NodeProcess> list = new LinkedList<NodeProcess>();
    for (int i = 0; i < NUM_OF_CASS_NODES; i++) {
      String tmpPidFile = String.format("%s/node%d.pid", CASS_PIDS_DIR, i);
      String name = String.format("node%d", i);
      File f = new File(tmpPidFile);
      String pid = u.getPidFromTmpPid(f);
      NodeProcess np = new NodeProcess(tmpPidFile, pid, name);
      list.add(np);
    }
    return (NodeProcess[]) list.toArray(new NodeProcess[list.size()]);
  }
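  // For reference, getNodeProcesses() expects one pid file per node under
  // /tmp/fi/pids/, e.g. (hedged; actual pids vary):
  //   /tmp/fi/pids/node0.pid   -> contains the JVM pid of node0
  //   /tmp/fi/pids/node1.pid   -> contains the JVM pid of node1
  // and u.getPidFromTmpPid() reads the pid string back out of such a file.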
  // *******************************************
  // is this a tmp-pid file?
  // *******************************************
  public boolean isTmpPid(String fname) {
    u.MESSAGE("FIXME isTmpPid \n");
    return false;
    //if (fname.contains(CASS_USERNAME) && fname.contains(".pid"))
    //  return true;
    //return false;
  }

  // *******************************************
  // the current failure hash file is the most recent
  // fail history (use "ls -t" to pick the newest file)
  // *******************************************
  public String getCurrentFailureHashFile() {
    String cmd = String.format("ls -t %s", FAIL_HISTORY_DIR);
    String cmdout = u.runCommand(cmd);
    String[] split = cmdout.split("\n", 2);
    String latest = split[0];
    String dotTxt = ".txt";
    // sanity check: the file should look like h<hash>.txt
    if (latest.indexOf("h") == 0 &&
        latest.indexOf(dotTxt) == latest.length() - dotTxt.length()) {
      return latest;
    }
    // it's possible that, if we don't inject a failure,
    // there is no failure hash; just warn for now
    u.WARNING("getLatestHashedFailure returns " + latest);
    return null;
  }

  // *******************************************
  // IMPORTANT !!!!
  // the reason we want to remove all blocks is that
  // we don't want to have other background traffic.
  // this is a bad hack .. but let's do it for now
  // *******************************************
  public void rmAllBlocks() {
    u.print("- Removing all blocks ...(OLD HDFS stuff, FIX ME if needed)\n");
  }

  // *******************************************
  Utility getUtility() {
    return u;
  }

  // *******************************************
  Cass getCass() {
    return cass;
  }

  // *******************************************
  // print done ...
  // *******************************************
  public void printAllExperimentsFinish() {
    String full   = "## ################################################# ##\n";
    String side   = "##                                                   ##\n";
    String middle = "##  A L L   E X P E R I M E N T S   F I N I S H !!! ##\n";
    u.print("\n\n");
    u.print(full);
    u.print(full);
    u.print(side);
    u.print(middle);
    u.print(side);
    u.print(full);
    u.print(full);
    u.print("\n\n");
  }

  // *******************************************
  // busy-waits (spins) for n seconds
  // *******************************************
  public static void waiting(int n) {
    long t0, t1;
    t0 = System.currentTimeMillis();
    do {
      t1 = System.currentTimeMillis();
    } while ((t1 - t0) < (n * 1000));
  }

  // *******************************************
  public void createAllDirectories() {
    u.mkDir(TMPFI);
    u.mkDir(EXP_RESULT_DIR);
    u.mkDir(FAIL_HISTORY_DIR);
    u.mkDir(COVERAGE_COMPLETE_DIR);
    u.mkDir(COVERAGE_STATIC_DIR);
    u.mkDir(FLAGS_FAILURE_DIR);
    u.mkDir(CASS_LOGS_DIR);
    u.mkDir(RPC_FILES_DIR);
    u.mkDir(SOCKET_HISTORY_DIR);
    u.mkDir(CASS_PIDS_DIR);
    //jinsu for net contextPassing
    u.mkDir(IP_HISTORY_DIR);
    //jinsu for checking tokenSize
    u.mkDir(TOKENS_DIR);
    //jinsu for experiment configurations
    u.mkDir(EXP_PROP_DIR);
  }

  // *******************************************
  public void printReminder() {
    u.print("## ############################################# ## \n");
    u.print("## ############################################# ## \n");
    u.print("##                                               ## \n");
    u.print("##     DON'T FORGET TO RUN:  make kill           ## \n");
    u.print("##                                               ## \n");
    u.print("## ############################################# ## \n");
    u.print("## ############################################# ## \n");
    // don't forget to run make kill (just make sure there is no java
    // process before this)
  }

  // *******************************************
  public void recordExpProp() {
    // writing the filter value to a file
    recordFilterId();
  }

  // *******************************************
  public void recordFilterId() {
    String fpath = EXP_PROP_DIR + "FILTERID";
    boolean succ = u.stringToFileContent(FILTER_ID, fpath);
    if (!succ) {
      u.ERROR("Can't create " + fpath);
    }
  }

  // *******************************************
  // note: main is intentionally empty here; presumably the harness that
  // launches the experiments constructs a Driver and calls run() itself
  // *******************************************
  public static void main(String[] args) {
  }

}