package org.fi; import org.fi.*; import org.fi.FMServer.FailType; import org.fi.FMJoinPoint.*; import java.io.*; import java.util.Random; public class ManualFI { private static int maxFailCount = 2; public ManualFI() { } public void resetFailCount(int fc) { try { File fcFile = new File("/tmp/failCount.txt"); fcFile.delete(); FileOutputStream fos = new FileOutputStream(fcFile); DataOutputStream dos = new DataOutputStream(fos); dos.writeInt(fc); dos.flush(); } catch (Exception e) { Util.EXCEPTION("at resetFailCount", e); } } // try to increment, if successful, then it's a go public boolean incrementFailCount() { try { int curFC; File fcFile = new File("/tmp/failCount.txt"); FileInputStream fis = new FileInputStream(fcFile); DataInputStream dis = new DataInputStream(fis); curFC = dis.readInt(); if (curFC < maxFailCount) { dis.close(); fis.close(); resetFailCount(curFC+1); return true; } else { return false; } } catch (Exception e) { // Util.EXCEPTION("at incrementFailCount", e); return false; } } // ************************************************* // doFail_01: write to edit log fail, reboot, then read long fail public FailType doFail_01(FMJoinPoint fjp, FMContext ctx, FMStackTrace st) { FailType fail; fail = f01a_WriteToEditLogInName1(fjp, ctx, st); if (fail != FailType.CRASH) return fail; fail = f01b_ReadLongFstimeInName1(fjp, ctx, st); if (fail != FailType.CRASH) return fail; return FailType.CRASH; } // ************************************************* // doFail_02: crash at datanode writes to data block and meta block public boolean doFail_02(FMAllContext fac) { // have I insert a single crash yet? // if not then move on .. if done, then return // File firstFailure = new File(FMLogic.FIRST_FAILURE_FLAG); // if (firstFailure.exists()) // return false; // just an optimization .. specifically we're interested // in client write and datanode failure only // hence we only "filter" data file/meta write // [file][/rhh/dfs/data1/tmp/blk_7307724612204181421] // and for network stream, node id should not be client (unknown so far) boolean passFilter = false; //if (FIState.isBlock(fac.ctx.getTargetIO())) //passFilter = true; // I only want to fail a datanode and net IO // if (Util.isNetIOtoDataNode(fac.ctx.getTargetIO()) && // fac.ctx.getNodeId().contains("DataNode")) // passFilter = true; // I only want to fail if this is a datanode if (fac.ctx.getNodeId().contains("DataNode")) passFilter = true; if (passFilter == false) return false; // build the FIState FIState fis = new FIState(fac, FailType.CRASH); // this is just an optimization, because actually // we can use the hash file to decide if we want to // fail or not // boolean isNew = FIState.addIfNew(fis); // if (!isNew) // return false; // let's check if we have injected this crash // or not in the past // if (fis.isFailedBefore()) { // System.out.format("_We have injected %d in the past_\n", // fis.getCompleteHashId()); // return false; // } // let's inject the crash, // 1) record the single-crash mode // 2) we want to remember the failure point // 3) by returning true // fis.recordToFile(); // try { // boolean rv = firstFailure.createNewFile(); // } catch (IOException e) { Util.EXCEPTION("weird", e);} return true; } // ************************************************* // doFail_03: crashing at points where a crash in // the pipeline does not cause a client to be dead // the cases here are specific ... no need to remember history // ************************************************* public boolean doFail_03(FMAllContext fac) { // basically we create a filter for doFail_02 .. // that is if (fac.ctx.getNodeId().equals("DataNode-1") && fac.fjp.contains("call(void java.io.OutputStream.write(byte[], int, int))") ) { return doFail_02(fac); } return false; } // ********************************************** public FailType f01a_WriteToEditLogInName1(FMJoinPoint fjp, FMContext ctx, FMStackTrace st) { boolean cond1, cond2, cond3, cond4; cond1 = cond2 = cond3 = cond4 = false; // cond1 .. f01a should not exist File f1 = new File("/tmp/fail01a"); if (!f1.exists()) cond1 = true; // cond2 .. f01b should not exist File f2 = new File("/tmp/fail01b"); if (!f2.exists()) cond2 = true; // second it must be the edit log under name1 // [.../dfs/name1/current/edits.new] or edits if (ctx.getTargetIO().contains("dfs/name1/current/edits")) cond3 = true; // first it must be under the context of logSync // [0] io.DataOutputBuffer (writeTo:113) // [1] namenode.EditLogOutputStream (flush:89) // [2] namenode.FSEditLog (logSync:994) // [3] namenode.FSNamesystem (mkdirs:1732) // [4] namenode.NameNode (mkdirs:553) if (st.contains("DataOutputBuffer", "writeTo") && st.contains("EditLogOutputStream", "flush") && st.contains("FSEditLog", "logSync") && st.contains("FSNamesystem", "mkdirs") && st.contains("NameNode", "mkdirs")) { cond4 = true; } // let's fail! now I need to remember this failure has happened if (cond1 && cond2 && cond3 && cond4) { File f3 = new File("/tmp/fail01a"); try { f3.createNewFile(); } catch (Exception e) { Util.ERROR(" ex 1"); }; return FailType.EXCEPTION; } return FailType.CRASH; } public FailType f01b_ReadLongFstimeInName1(FMJoinPoint fjp, FMContext ctx, FMStackTrace st) { boolean cond1, cond2, cond3, cond4, cond5; cond1 = cond2 = cond3 = cond4 = cond5 = false; // second it must be the edit log under name1 // [.../dfs/name1/current/fstime] if (ctx.getTargetIO().contains("dfs/name1/current/fstime")) cond1 = true; if (fjp.contains("DataInputStream.readLong()")) cond2 = true; // and readLong during startup only !! // SourceLoc: [575] [FSImage.java] // call(long java.io.DataInputStream.readLong()) // [0] namenode.FSImage (readCheckpointTime:575) // [1] namenode.FSImage (loadFSImage:777) // [2] namenode.FSImage (recoverTransitionRead:369) // [3] namenode.FSDirectory (loadFSImage:95) // [4] namenode.FSNamesystem (initialize:315) // [5] namenode.FSNamesystem (<init>:292) // [6] namenode.NameNode (initialize:204) // [7] namenode.NameNode (<init>:288) // [8] namenode.NameNode (createNameNode:967) // [9] namenode.NameNode (main:976) // [T] TOTAL HASH CODE: [4345152] if (st.contains("FSImage" , "readCheckpointTime") && st.contains("FSImage" , "loadFSImage") && st.contains("FSDirectory" , "loadFSImage") && st.contains("FSNamesystem" , "initialize") && st.contains("NameNode" , "initialize") && st.contains("NameNode" , "createNameNode") && st.contains("NameNode" , "main")) { cond3 = true; } // condition 4, f01a must happen first, but only if cond1-3 is true File f1 = new File("/tmp/fail01a"); if (f1.exists()) cond4 = true; File f2 = new File("/tmp/fail01b"); if (!f2.exists()) cond5 = true; // let's fail and rememer this if (cond1 && cond2 && cond3 && cond4 && cond5) { File f3 = new File("/tmp/fail01b"); try { f3.createNewFile(); } catch (Exception e) { Util.ERROR(" ex 2");}; return FailType.CORRUPTION; } return FailType.CRASH; } }