package org.fi; import org.fi.*; import org.fi.FMServer.FailType; import org.fi.FMJoinPoint.*; import java.io.*; import java.util.*; // cassandra specifics public class FMLogic { private static int cachedMaxFsn = 0; // cache, don't update directly // public static final String CASS_USERNAME = "hadoop-haryadi"; public static final String CASS_USERNAME = "cassandra-fi" + System.getenv("USER"); public static final String TMPFI = "/tmp/fi/"; public static final String CASS_STORAGE_DIR = TMPFI + "cassandra/"; public static final String FAIL_HISTORY_DIR = TMPFI + "failHistory/"; public static final String FLAGS_FAILURE_DIR = TMPFI + "flagsFailure/"; public static final String EXP_RESULT_DIR = TMPFI + "expResult/"; public static final String RPC_FILES_DIR = TMPFI + "rpcFiles/"; public static final String SOCKET_HISTORY_DIR = TMPFI + "socketHistory/"; public static final String COVERAGE_COMPLETE_DIR = TMPFI + "coverageComplete/"; public static final String COVERAGE_STATIC_DIR = TMPFI + "coverageStatic/"; public static final String CASS_PIDS_DIR = TMPFI + "pids/"; public static final String CASS_LOGS_DIR = TMPFI + "logs/"; //public static final String CASS_STORAGE_DIR = TMPFI + "cassandra/"; public static final String IP_HISTORY_DIR = TMPFI + "ipHistory/"; public static final String ENABLE_FAILURE_FLAG = TMPFI + "enableFailureFlag"; public static final String CLIENT_OPTIMIZE_FLAG = TMPFI + "clientOptimizeFlag"; public static final String ENABLE_COVERAGE_FLAG = TMPFI + "enableCoverageFlag"; public static final String NODES_CONNECTED_FLAG = TMPFI + "nodesConnectedFlag"; public static final String EXPERIMENT_RUN_FLAG = TMPFI + "experimentRunning"; //JINSU public static final String EXP_PROP_DIR = TMPFI + "expProp/"; public static String FILTER_ID = ""; // ######################################################################## // ######################################################################## // ## ## // ## S E T U P S ## // ## ## // ######################################################################## // ######################################################################## // ********************************************* public FMLogic() { //JINSU hack getExpProp(); } //JINSU hack for cass corruption // private boolean debug = false; private static boolean debug = true; private static boolean isDigestReadResponse(FMAllContext fac) { if(fac.ctx.getMessageType().equalsIgnoreCase(FMClient.READ_RESPONSE_DIGEST) && fac.fjp.contains("sendOneWay")) { //System.out.println("POW POW kitty"); //System.out.println("FMLogic can run the corruption!!!"); return true; } return false; } private static boolean isDataReadResponse(FMAllContext fac) { if(fac.ctx.getMessageType().equalsIgnoreCase(FMClient.READ_RESPONSE_NORMAL) && fac.fjp.contains("sendOneWay") ) { //System.out.println("logic DRR :: " + fac.ctx.getMessageType() + " :: " + fac.fjp.contains("sendOneWay") ); return true; } return false; } // ********************************************* // the brain of fm logic begins. have fun! // ********************************************* public static FailType run(FMAllContext fac) { /* if (isDigestReadResponse(fac)) { return FailType.CORRUPTION; } */ FailType ft; // check if we need to reset anything? checkResetExperiment(); // generate all possible failures FailType [] failures = listPossibleFailures(fac); ft = tryTheseFailures(fac, failures); // DEPRECATED ... we are calling this at the client side // check if we have persistent failure, see the function's comment // ft = checkPersistentFailure(fac, ft); return ft; } // ********************************************* // do we need to reset anything? signaled by WorkloadDriver // ********************************************* private static void checkResetExperiment() { } // ********************************************* // List all possible failures, given the information about this // pointcut. It's up to the FIState model-checker and the // server filter to decide which failures that we want to exercise // later. All we want to do here is simply list all possible failures. // ********************************************* private static FailType[] listPossibleFailures(FMAllContext fac) { List<FailType> list = new ArrayList<FailType>(); boolean crash = false; boolean exception = false; boolean corruption = false; boolean baddisk = false; boolean retfalse = false; // throw exception if it's possible (must before) if (fac.fjp.getJoinExc() != JoinExc.NONE && fac.fjp.getJoinPlc() == JoinPlc.BEFORE) { exception = true; } // special case, if it's an FNF exception, it's okay // that we throw this FNF exception after -- because // in 'before' we haven't known the context yet if (fac.fjp.getJoinExc() == JoinExc.FNF) { exception = true; } // false bool if operation JoinRbl is yes (must before) if (fac.fjp.getJoinRbl() == JoinRbl.YES && fac.fjp.getJoinPlc() == JoinPlc.BEFORE) { retfalse = true; } // corruption only if iot is read (must after) /* if (fac.fjp.getJoinIot() == JoinIot.READ && fac.fjp.getJoinPlc() == JoinPlc.AFTER) { corruption = true; } */ //JINSU: change this when we are adding more corruption cases. if (isDigestReadResponse(fac) || isDataReadResponse(fac)) { corruption = true; } // crash for read and write (before or after is fine) if (fac.fjp.getJoinIot() == JoinIot.READ || fac.fjp.getJoinIot() == JoinIot.WRITE || isDataReadResponse(fac) ) { crash = true; } // baddisk if targetIO is disk (must be before) if (fac.ctx.getTargetIO().contains("hadoop") && fac.fjp.getJoinPlc() == JoinPlc.BEFORE) { // but we only want to insert baddisk if // we haven't failed this disk if (!isTargetIOaBadDisk(fac.ctx)) { baddisk = true; } } // now let's add possible failures if (crash) list.add(FailType.CRASH); if (exception) list.add(FailType.EXCEPTION); if (corruption) list.add(FailType.CORRUPTION); if (baddisk) list.add(FailType.BADDISK); if (retfalse) list.add(FailType.RETFALSE); if (list.size() == 0) return null; FailType[] failures = list.toArray(new FailType[list.size()]); if (false) { System.out.print(" Possible failures: "); for (int i = 0; i < failures.length; i++) { System.out.print(failures[i].toString() + ", "); } System.out.println("\n"); } return failures; } // ********************************************* // At this point, we might have a failure to exercise // or we don't, we'll just return this to the FMClient // but before that, we always need to check for // persistent failures (e.g. baddisk), because even // though we don't have a failure at this point, we // might have exercised a persistent failure (e.g. baddisk) // before. Hence, we want to check if the persistent // failure overwrites non-failure. // If we should exercise a failure, we just return the failure // this function only conversts FailType.NONE to // FailType.BADDISK if it's appropriate // ********************************************* public static FailType checkPersistentFailure(FMAllContext fac, FailType ft) { if (ft != FailType.NONE) { return ft; } if (isTargetIOaBadDisk(fac.ctx)) { return FailType.BADDISK; } return FailType.NONE; } // ********************************************* // from the ctx, is target an already bad disk? // if so, return true, else return false // ********************************************* private static boolean isTargetIOaBadDisk(FMContext ctx) { // if this is not a disk io return false if (!Util.isDiskIO(ctx.getTargetIO())) { return false; } // let's get the nodeId and diskId for this ctx String nodeId = ctx.getNodeId(); String diskId = Util.getDiskIdFromTargetIO(ctx.getTargetIO()); // something wrong if (diskId.equals("DiskUnknown")) return false; // check the flag file File flagFile = getBadDiskFlagFile(nodeId, diskId); if (flagFile.exists()) { return true; } return false; } // ********************************************* // for each possible failure, we want to try if we // so do the failure or not. // if ft is approved, then we should break // if not, we should continue to the next failure // ********************************************* private static FailType tryTheseFailures(FMAllContext fac, FailType [] failures) { FailType ft = FailType.NONE; if (failures == null) return ft; for (int i = 0; i < failures.length; i++) { ft = tryThisFailure(fac, failures[i]); if (ft != FailType.NONE) { break; } } return ft; } // ********************************************* // Before we insert the failure, we want to filter this // first. So check the filter. // ********************************************* private static FailType tryThisFailure(FMAllContext fac, FailType ft) { // let's build the FIState based on the failure // build the FIState FIState fis = new FIState(fac, ft); if (FMFilter.passServerFilter(fac, ft, fis)) { // if pass the server filter, we want to measure the stats // that have been filtered .. Coverage.recordStatAfterFilter(fac, ft, fis); // for the sake of recording stat, we're done .. // so no need to continue ... // just check if if (isEnableFailureFlagExist()) { FailType retFt = runFailLogic(fac, ft, fis); return retFt; } } return FailType.NONE; } // ######################################################################## // ######################################################################## // ## ## // ## C O R E L O G I C ## // ## ## // ######################################################################## // ######################################################################## // ************************************************* // This is the fail logic: logics for single crash, // multiple crashes, remembering failure history // should all go in this place // ************************************************* private static FailType runFailLogic(FMAllContext fac, FailType ft, FIState fis) { // if i have reached max fsn ... // just continue ... if (hasReachedMaxFsn()) { return FailType.NONE; } // let's get current failure number int fsn = getCurrentFsn(); // check the logic if (!shouldFail(fsn, fis)) { return FailType.NONE; } // if we reach this point we're doing failure .. recordFailure(fac, ft, fis, fsn); return ft; } // ******************************************** // Given a fsn and a hash, this is the logic: // - first we check if the fsn is locked or not // if it is locked then the hash must match // with the hash specified by the locked fsn. // otherwise, we shouldn't fail this. // - else, if fsn is not locked, then we go // to normal mode, where we check if we fail // this failure before or not // ******************************************** private static boolean shouldFail(int fsn, FIState fis) { // String tmp = String.format("fsn-%d hash-%s", fsn, hash); boolean shouldFail; if (isFsnLocked(fsn)) { if (isFsnAndHashMatched(fsn, fis.getHashId())) { shouldFail = true; } else { shouldFail = false; } } else { if (isInFailHistory(fsn, fis.getHashId())) { // System.out.format("_We have injected %d in the past_\n", // fis.getHashId()); shouldFail = false; } else { shouldFail = true; } } return shouldFail; } // ******************************************** // record the failure // ******************************************** private static void recordFailure(FMAllContext fac, FailType ft, FIState fis, int fsn) { recordInjectedFsn(fsn); recordFailHistory(fac, ft, fis, fsn); recordFailureToExperiment(fac, ft, fis, fsn); recordLatestHistory(fsn, fis); // special treatment for bad disk recordBadDisk(fac, ft); } // ######################################################################## // ######################################################################## // ## ## // ## U T I L I T Y ## // ## ## // ######################################################################## // ######################################################################## // ******************************************** // This algorithm is easy ... we're just failing // whatever we have // ******************************************** public static boolean hasReachedMaxFsn() { int maxFsn = getMaxFsn(); if (isFsnInjected(maxFsn)) { System.out.println("corrupt :: hr1 : checkpoint"); return true; } return false; } // ******************************************* public static int getMaxFsn() { if (cachedMaxFsn != 0) return cachedMaxFsn; String path = FLAGS_FAILURE_DIR + "/maxFsn"; String tmp1 = Util.fileContentToString(path); if (tmp1 == null) { Util.FATAL("maxFsn is unknown"); return 0; } tmp1 = tmp1.replaceAll("\n", ""); Integer tmp; try { tmp = new Integer(tmp1); } catch(NumberFormatException nfe) { Util.FATAL("There is no maxFsn file?"); return 0; } int maxFsn = tmp.intValue(); if (maxFsn < 1 || maxFsn > 100) { Util.FATAL("weird maxFsn " + maxFsn); } cachedMaxFsn = maxFsn; return cachedMaxFsn; } // ******************************************* private static boolean isFsnInjected(int fsn) { File f = getInjectedFsnFile(fsn); if (f.exists()) return true; return false; } // ******************************************* private static File getInjectedFsnFile(int fsn) { String path = String.format("%s/injected-fsn-%d", FLAGS_FAILURE_DIR, fsn); File f = new File(path); return f; } // ******************************************** public static int getCurrentFsn() { int fsn = 1; while (true) { if (!isFsnInjected(fsn)) return fsn; fsn++; } } // ******************************************** // if fsn is locked it means this fsn has a // specific hash that we must follow // ******************************************** private static boolean isFsnLocked(int fsn) { File f = getFsnLockFile(fsn); if (f.exists()) return true; return false; } // ******************************************** // filename: locked-fsn-# // ******************************************** private static File getFsnLockFile(int fsn) { String path = String.format("%s/locked-fsn-%d", FLAGS_FAILURE_DIR, fsn); File f = new File(path); return f; } // ******************************************** // // ******************************************** private static boolean isFsnAndHashMatched(int fsn, int hashId) { File f = getFsnAndHashFile(fsn, hashId); if (f.exists()) return true; return false; } // ******************************************** // filename: hash-for-fsn-%d-is- // ******************************************** private static File getFsnAndHashFile(int fsn, int hashId) { String path = String.format("%s/hash-for-fsn-%d-is-h%s.txt", FLAGS_FAILURE_DIR, fsn, hashId); File f = new File(path); return f; } // ******************************************** // // ******************************************** private static boolean isInFailHistory(int fsn, int hashId) { File f = getFailHistoryFile(fsn, hashId); if (f.exists()) return true; return false; } // ******************************************** // fail history file: .../failHistory/fsn-1/h-d989.txt // ******************************************** private static File getFailHistoryFile(int fsn, int hashId) { String dir = String.format("%s/fsn-%d", FAIL_HISTORY_DIR, fsn); File d = new File(dir); if (!d.exists()) { Util.mkDir(d); } String file = getHashFileName(hashId); File f = new File(d, file); return f; } // ******************************************** public static String getHashFileName(int hashId) { return String.format("h%d.txt", hashId); } // ******************************************* private static void recordInjectedFsn(int fsn) { File f = getInjectedFsnFile(fsn); Util.createNewFile(f); } // ************************************************* private static void recordBadDisk(FMAllContext fac, FailType ft) { // no need to do anything if it's not a baddisk if (ft != FailType.BADDISK) return; // if it's a bad disk .... need to remember what node and what disk .. String nodeId = fac.ctx.getNodeId(); String diskId = Util.getDiskIdFromTargetIO(fac.ctx.getTargetIO()); File flagFile = getBadDiskFlagFile(nodeId, diskId); Util.createNewFile(flagFile); } // ******************************************* private static File getBadDiskFlagFile(String nodeId, String diskId) { String fname = String.format("%s/BadDisk_%s_%s", FLAGS_FAILURE_DIR, nodeId, diskId); File f = new File(fname); return f; } // ******************************************** private static void recordFailHistory(FMAllContext fac, FailType ft, FIState fis, int fsn) { File f = getFailHistoryFile(fsn, fis.getHashId()); if (f.exists()) { // it's okay that a fail history already there // for example, in the case where fail history is locked .. return; } String buf = getFailHistoryContent(fac, ft, fis, fsn); Util.stringToFileContent(buf, f, true); } // ******************************************** public static String getFailHistoryContent(FMAllContext fac, FailType ft, FIState fis, int fsn) { String buf = ""; // print hash id first buf += "\n"; buf += "The hash ID string is: \n"; buf += "## [" + fis.getHashIdStr() + "] \n"; buf += "\n"; buf += "The hash ID is: \n"; buf += "[[" + fis.getHashId() + "]] \n"; buf += "\n"; // print all context buf += String.format("Receive sendContext: [" + fac.ctx.getCutpointRandomId() + "]\n"); buf += "\n"; if (ft != null) { buf += "FailType: **" + ft.toString() + "**\n\n"; } buf += fac.ctx + "\n"; buf += fac.fjp + "\n"; buf += fac.fst + "\n"; buf += "\n"; return buf; } // ******************************************** // we must do this because "ls -t" is not precise // ******************************************** private static void recordLatestHistory(int fsn, FIState fis) { File f = getLatestHistoryFile(fsn); Util.stringToFileContent(fis.getHashId() + "\n", f); } // ******************************************** // return the latest history file for this fsn // ******************************************** private static File getLatestHistoryFile(int fsn) { String path = String.format("%s/latest-for-fsn-%d", FAIL_HISTORY_DIR, fsn); File f = new File(path); return f; } // ******************************************** // this is the function that is dependent on workload driver // however, if expNumStr // ******************************************** private static void recordFailureToExperiment(FMAllContext fac, FailType ft, FIState fis, int fsn) { String path = FLAGS_FAILURE_DIR + "/currentExpNumber"; String expNumStr = Util.fileContentToString(path); if (expNumStr == null) { Util.WARNING("No info on experiment number, continuing"); return; } expNumStr = expNumStr.replaceAll("\n", ""); Integer expNum; try { expNum = new Integer(expNumStr); } catch(NumberFormatException nfe) { Util.EXCEPTION("Can't convert exp# ", nfe); return; } String expNumDirName = getExpNumDirName(expNum.intValue()); Util.mkDir(expNumDirName); path = String.format("%s/fsn%d-%s", expNumDirName, fsn, getHashFileName(fis.getHashId())); String buf = getFailHistoryContent(fac, ft, fis, fsn); Util.stringToFileContent(buf, path); } // ******************************************** private static String getExpNumDirName(int expNum) { return String.format("%s/exp-%05d", EXP_RESULT_DIR, expNum); } // *********************************************************** public static boolean isEnableFailureFlagExist() { File f = new File(ENABLE_FAILURE_FLAG); if (f.exists()) return true; return false; } public static void getExpProp(){ FILTER_ID = getFilterId(); } //JINSU hack: get the filter id from /tmp/fi/FILTERID file. public static String getFilterId() { String ret = Util.fileContentToString(EXP_PROP_DIR + "FILTERID"); if(ret == null) { Util.WARNING("filterId is null"); ret = ""; } return ret; } }