package org.fi; // ************************************ org fi import org.fi.*; import org.fi.FMServer.FailType; import org.fi.FMJoinPoint.*; // ************************************ jol import jol.core.JolSystem; import jol.core.Runtime; import jol.types.basic.BasicTupleSet; import jol.types.basic.Tuple; import jol.types.basic.TupleSet; import jol.types.exception.JolRuntimeException; import jol.types.exception.UpdateException; import jol.types.table.TableName; import jol.types.table.Table.Callback; import jol.types.table.Table; // ************************************ aspect import org.aspectj.lang.Signature; // include this for Signature, etc! import org.aspectj.lang.JoinPoint; import org.aspectj.lang.reflect.SourceLocation; // ************************************ java import java.io.*; import java.net.InetSocketAddress; import java.lang.Thread; import java.lang.StackTraceElement; import java.net.URL; // ************************************ XML RPC import org.apache.xmlrpc.client.XmlRpcClient; import org.apache.xmlrpc.client.XmlRpcClientConfigImpl; // FMClient send to FMServer a list of String // actually it will go to FMServer first // the list of String is basically what the event should // schedule public class FMClient { public static final int FAAS_SYSTEM_EXIT_STATUS = 13; //JINSU for cfiHooks public static String READ_RESPONSE_DIGEST = "ReadResponseDigest"; public static String READ_RESPONSE_NORMAL = "ReadResponseNormal"; public static String UNKNOWN_MESSAGE = "Unknown"; // ##################################################################### // ##################################################################### // ## ## // ## M A I N E N T R Y P O I N T S ## // ## ## // ##################################################################### // ##################################################################### // ****************************************************** // Each hook has some important properties: // - Return object: // The hook must return an object, this is important // if we want to process corruption. The aspect always // uses this returned value. By default the object is // the object we get from proceed(). // - Throws exception: // For exceptions that we want to exercise such as // IOException and FileNotFoundException, then we must // specify explicit hooks for that, so that these special // hooks can return the exceptions as specified. // - FailType.EXCEPTION // Fail type exception is processed here. Other fail types // should be proceed deeper in a centralized location. // FailType.Corruption is alredy processed by now, and // the resulting corruption should be in JoinRov // - FailType.BadDisk // ****************************************************** public static Object fiHookIox(FMJoinPoint fjp) throws IOException { FailType ft = tryFiHook(fjp); if (fjp.getIox() != null) throw fjp.getIox(); return fjp.getJoinRov(); } // ****************************************************** public static Object fiHookFnfx(FMJoinPoint fjp) throws FileNotFoundException { FailType ft = tryFiHook(fjp); if (fjp.getFnfx() != null) throw fjp.getFnfx(); return fjp.getJoinRov(); } // ****************************************************** public static Object fiHookNox(FMJoinPoint fjp) { FailType ft = tryFiHook(fjp); return fjp.getJoinRov(); } // ##################################################################### // ##################################################################### // ## ## // ## S A N I T Y C H E C K & F I L T E R I N G ## // ## ## // ##################################################################### // ##################################################################### // ****************************************************** private static FailType tryFiHook(FMJoinPoint fjp) { return syncedTryFiHook(fjp); } // ****************************************************** // first do some sanity checking and filtering // 1. check that context is not null // 2. check that fm server is connected // 3. check that we pass client filter, whatever we define here // 4. let's do actual hook // ****************************************************** private static synchronized FailType syncedTryFiHook(FMJoinPoint fjp) { //System.out.println("- stf (0)\n"); if(!isExperimentRunning()) { //System.out.println("- stf (1)\n"); return FailType.NONE; } // TODO: this is a bit wrong, because for FNFException .. // which is thrown by RAF.new FOS.new .. we haven't seen // the context so it's still null, but we might want to // throw an FNFException ... right now we're throwing FNFException // after the call RAF.nww and FOS.new if (isNullContext(fjp)) { //System.out.println("- stf (2)\n"); return FailType.NONE; } if (!passClientFilter(fjp)) return FailType.NONE; //System.out.println("- stf (3)\n"); FailType ft = doFiHook(fjp); return ft; } private static boolean isExperimentRunning() { File f = new File(FMLogic.EXPERIMENT_RUN_FLAG); if(f.exists()) return true; return false; } // ****************************************************** // context is okay to be null here because we're not weaving // this aspect.aj to all files .. so ClassWC objects generated in // non-weaved files will not have context // ****************************************************** private static boolean isNullContext(FMJoinPoint fjp) { if (fjp.getClassWC() == null) { Util.WARNING(fjp.getJoinPoint(), "null class WC at failure hook (FMClient)"); return true; } if (fjp.getClassWC().getContext() == null) { // FIXME, hack: // check fi cwc is an instance of file or not // if so let's get the absolute path .. // the reason why File doesn't have context is because // sometimes File is obtained from File.listFile() ClassWC cwc = fjp.getClassWC(); if (cwc instanceof File) { File f = (File)cwc; f.context = new Context(f.getAbsolutePath()); return false; } // Util.WARNING(fjp.getJoinPoint(), "null context at failure hook (FMClient)"); return true; } return false; } // ****************************************************** // some filtering we could do at client private static boolean passClientFilter(FMJoinPoint fjp) { boolean pass = false; // we're interested in after join place only // if (fjp.getJoinPlc() == JoinPlc.AFTER) { // pass = true; // } // DEPRECATED policy // A little logic in insertFiHook, because depending on the // context, we want to do the failure before/after. // So far, here's a little policy: // - Context = Disk, crash after // - Context = NetIO, crash before and after pass = true; return pass; } // ##################################################################### // ##################################################################### // ## ## // ## F M C L I E N T L O G I C ## // ## ## // ##################################################################### // ##################################################################### // ****************************************************** private static FailType doFiHook(FMJoinPoint fjp) { // Util.WARNING(jp, "intercepted!!!"); // **************************** // 1. prepare // prepare FST, FMC, FJP, id (all Writable) // **************************** Thread t = Thread.currentThread(); FMStackTrace fst = new FMStackTrace(t.getStackTrace()); FMContext ctx = new FMContext(fjp.getClassWC().getContext().getTargetIO()); ctx.setCutpointRandomId(); // **************************** // 2. let's get failure // REMEMBER: remember that if this hangs, then this means // that some of the args that are passed here do not have // a correct Writable read and write implementation! // So, do check each argument // **************************** FMAllContext fac = new FMAllContext(fjp, ctx, fst); FailType ft = sendContextOptimizer(fac); // **************************** // 3. now let's check if there is any persistent failure // **************************** ft = FMLogic.checkPersistentFailure(fac, ft); // **************************** // 4. let's process the failure // **************************** printFailType(fjp, fst, ctx, ft); processFailure(fjp, fst, ctx, ft); // some FailTypes (e.g. exception might not have been processsed // here, so we need to pass this on) return ft; } // ****************************************************** // Do more optimization at the client, so that we reduce // communication to the fm server // ****************************************************** private static FailType sendContextOptimizer(FMAllContext fac) { // if we have reached the max fsn .. then there is no point // we're checking this to the fm server logic // but remember we still need to check for persistent failures if (isClientOptimizeFlagExist()) { if (!FMLogic.isEnableFailureFlagExist()) { return FailType.NONE; } if (FMLogic.hasReachedMaxFsn()) { return FailType.NONE; } } FailType ft = sendContextViaXmlRpc(fac); return ft; } // *********************************************************** private static boolean isClientOptimizeFlagExist() { File f = new File(FMLogic.CLIENT_OPTIMIZE_FLAG); if (f.exists()) return true; return false; } // ****************************************************** private static void printFailType(FMJoinPoint fjp, FMStackTrace fst, FMContext ctx, FailType ft) { if (ft == FailType.NONE) return; if (!FMServer.debug) return; Util.MESSAGE("I'm failing this (see below) with FailType: " + ft); System.out.println(ctx); System.out.println(fjp); System.out.println(fst); System.out.println(""); } // ***************************************************** private static void processFailure(FMJoinPoint fjp, FMStackTrace fst, FMContext ctx, FailType ft) { if (ft == FailType.NONE) return; else if (ft == FailType.CRASH) processCrash(fjp, fst, ctx); else if (ft == FailType.CORRUPTION) processCorruption(fjp, fst, ctx); else if (ft == FailType.RETFALSE) processReturnFalse(fjp, fst, ctx); else if (ft == FailType.EXCEPTION) processException(fjp, fst, ctx); else if (ft == FailType.BADDISK) processBadDisk(fjp, fst, ctx); } // ****************************************************** private static void processCrash(FMJoinPoint fjp, FMStackTrace fst, FMContext ctx) { String pidToCrash = Util.getPid(); Util.WARNING("I'm crahing here, and should see no more output"); // 1) let's do the forceful way, use kill // String cmd = String.format("kill -s KILL %5s", pidToCrash); // String cmdout = Util.runCommand(cmd); //################################# //################################# //JINSU: We are crashing a node so all nodes aren't connected anymore. //IMPORTANT HACK //################################# //################################# org.apache.cassandra.Util.debug("In FMClient processCrash(), deleting nodeConnectedFlag"); Util.deleteNodeConnectedFlag(); // )2 or, let's do the normal way System.exit(FAAS_SYSTEM_EXIT_STATUS); Util.ERROR("if you see this, we are not crashing properly"); // if we ever see this file, we're not failing properly File f = new File(FMLogic.TMPFI + "CRASH-FAILED"); try { f.createNewFile(); } catch (Exception e) { } } // ****************************************************** private static void processCorruption(FMJoinPoint fjp, FMStackTrace fst, FMContext ctx) { Object jrov = fjp.getJoinRov(); // something is wrong, for corruption jrov should not be null if (jrov == null) { Util.FATAL("Corrupting a null Join ROV"); return; } if (jrov instanceof java.lang.Long) { long tmp = ((Long)jrov).longValue(); long tmp2; tmp2 = tmp - (tmp % 100000); tmp2 += (2 * 3600 * 1000); Long newJrov = new Long(tmp2); fjp.setJoinRov(newJrov); Util.MESSAGE("Corrupting read long from " + tmp + " to " + newJrov.longValue()); } } // ****************************************************** private static void processReturnFalse(FMJoinPoint fjp, FMStackTrace fst, FMContext ctx) { Object jrov = fjp.getJoinRov(); // something is wrong, for ret false, jrov must be null! if (jrov != null) { // if we get to this point, there must be some race condition // where at JoinPlc.BEFORE, this join point does not see // the failure, so it doesn't return a false, but then // at JoinPlc.AFTER, some other join point in other threads // is being failed (E.g. baddisk), and hence, suddently // JoinPlc.AFTER for this point "Sees" the failure, but // we have run proceed() for this joinpoint. So in this case, // we should just say an error rather than a fatal Util.ERROR("Returning false, but Join ROV is not null"); return; } Boolean newJrov = new Boolean(false); fjp.setJoinRov(newJrov); Util.MESSAGE("Returning false now ... "); } // ****************************************************** // We set the exception that we should throw later in // FMJoinPoint // ****************************************************** private static void processException(FMJoinPoint fjp, FMStackTrace fst, FMContext ctx) { if (fjp.getJoinExc() == JoinExc.IO) { fjp.setIox(new IOException("Intentional IOException from " + fjp.toString())); } else if (fjp.getJoinExc() == JoinExc.FNF) { fjp.setFnfx(new FileNotFoundException("Intentional IOException from FM")); } } // ****************************************************** // If a bad disk, then we want to see what's the // join point is about. If it's exception then baddisk // will manifest to an exception. If it's a boolean return // value, it will manifest to a false return value // ****************************************************** private static void processBadDisk(FMJoinPoint fjp, FMStackTrace fst, FMContext ctx) { if (fjp.getJoinExc() != JoinExc.NONE) processException(fjp, fst, ctx); else if (fjp.getJoinRbl() == JoinRbl.YES) processReturnFalse(fjp, fst, ctx); } // ****************************************************** // DEPRECATED -- never crash the recipient of we get // a weird dead RPC to the FM server --- because when we // crash the other datanode, the other datanode could // be inside the FM server, and the FM server cannot return // properly, so then other RPC is dead ... private static void crashRecipient(FMContext ctx) { // String nodeId = Util.getNodeIdFromNetIO(ctx.getTargetIO()); // String pidToCrash = Util.getPidFromNodeId(nodeId); // if (pidToCrash.equals("0")) { bad } // ####################################################################### // ####################################################################### // #### #### // #### X M L R P C #### // #### #### // ####################################################################### // ####################################################################### private static XmlRpcClient fmClient; // ****************************************************** private static boolean cannotConnectToServer() { if (fmClient != null) return false; connectToFMServer(); if (fmClient == null) return true; return false; } // ****************************************************** private static void connectToFMServer() { try { XmlRpcClientConfigImpl config = new XmlRpcClientConfigImpl(); String httpAddr = "http://127.0.0.1:" + FMServer.PORT + "/xmlrpc"; config.setServerURL(new URL(httpAddr)); fmClient = new XmlRpcClient(); fmClient.setConfig(config); } catch (Exception e) { fmClient = null; } } // ****************************************************** private static FailType sendContextViaXmlRpc(FMAllContext fac) { FailType ft = FailType.NONE; // this is okay, because we don't always want to // run hdfs with fm server if (cannotConnectToServer()) return ft; int randId = fac.ctx.getCutpointRandomId(); File f = Util.getRpcFile(randId); DataOutputStream dos = Util.getRpcOutputStream(randId); try { fac.write(dos); dos.close(); // System.out.format("- Sending %d \n", randId); Object[] params = new Object[]{new Integer(randId)}; Integer result = 0; result = (Integer) fmClient.execute("FMServer.sendContext", params); ft = Util.intToFailType(result); // System.out.format("- Received %d %s \n", result, ft); f.delete(); } catch (Exception e) { f.delete(); Util.EXCEPTION("RPC client error", e); // Util.FATAL("RPC client error"); // it's okay that if we cannot connect } return ft; } // ####################################################################### // ####################################################################### // #### #### // #### H A D O O P R P C #### // #### #### // ####################################################################### // ####################################################################### /* private static FMProtocol fmp = null; private static Configuration conf = new Configuration(); private static InetSocketAddress addr = new InetSocketAddress(FMServer.bindAddr, FMServer.port); // ****************************************************** private static FailType sendContextViaHadoopRPC(FMAllContext fac) { if (isNullFMServer()) return FailType.NONE; FailType ft = FailType.NONE; ft = fmp.sendContext(fac.fjp, fac.ctx, fac.fst); return ft; } // ****************************************************** private static boolean isNullFMServer() { if (fmp != null) return false; if (connectToFMServer() != null) return false; return true; } // ****************************************************** private static FMProtocol connectToFMServer() { if (fmp != null) return fmp; try { // this shouldn't be wait for proxy, if fm server is not there, continue fmp = (FMProtocol) RPC.getProxy(FMProtocol.class, FMProtocol.versionID, addr, conf); // RPC.waitForProxy(FMProtocol.class, FMProtocol.versionID, addr, conf); } catch (IOException e) { Util.WARNING("cannot contact FM"); return null; } return fmp; } */ ///////////////////////////////////////////////// ///////////////////////////////////////////////// //////////// Jinsu's Cassandra Utility ////////// ///////////////////////////////////////////////// ///////////////////////////////////////////////// public static FailType doCfiHook(FMJoinPoint fjp, Context c) { Thread t = Thread.currentThread(); FMStackTrace fst = new FMStackTrace(t.getStackTrace()); FMContext ctx = new FMContext(c.getTargetIO(), c.getMessageType()); ctx.setCutpointRandomId(); FMAllContext fac = new FMAllContext(fjp, ctx, fst); //System.out.println(fac); FailType ft = sendContextViaXmlRpc(fac); return ft; } public static void callProcessCrash() { String pidToCrash = Util.getPid(); Util.WARNING("I'm crahing here, and should see no more output"); // 1) let's do the forceful way, use kill // String cmd = String.format("kill -s KILL %5s", pidToCrash); // String cmdout = Util.runCommand(cmd); //################################# //################################# //JINSU: We are crashing a node so all nodes aren't connected anymore. //IMPORTANT HACK //################################# //################################# org.apache.cassandra.Util.debug("In FMClient callProcessCrash(), deleting nodeConnectedFlag"); Util.deleteNodeConnectedFlag(); // )2 or, let's do the normal way System.exit(FAAS_SYSTEM_EXIT_STATUS); Util.ERROR("if you see this, we are not crashing properly"); } }