package com.yahoo.dtf.comm; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.net.InetAddress; import java.util.ArrayList; import java.util.Enumeration; import java.util.HashMap; import java.util.Iterator; import java.util.Properties; import java.util.concurrent.atomic.AtomicLong; import com.yahoo.dtf.comm.Comm; import com.yahoo.dtf.comm.CommClient; import com.yahoo.dtf.comm.CommServer; import com.yahoo.dtf.DTFConstants; import com.yahoo.dtf.DTFNode; import com.yahoo.dtf.DTFProperties; import com.yahoo.dtf.NodeInfo; import com.yahoo.dtf.NodeShutdownHook; import com.yahoo.dtf.NodeState; import com.yahoo.dtf.actions.Action; import com.yahoo.dtf.actions.component.Component; import com.yahoo.dtf.actions.flowcontrol.Sequence; import com.yahoo.dtf.comm.rpc.Node; import com.yahoo.dtf.config.Config; import com.yahoo.dtf.debug.DebugServer; import com.yahoo.dtf.exception.CommException; import com.yahoo.dtf.exception.DTFException; import com.yahoo.dtf.exception.PropertyException; import com.yahoo.dtf.logger.DTFLogger; import com.yahoo.dtf.util.HostUtils; import com.yahoo.dtf.util.SystemUtil; import com.yahoo.dtf.util.ThreadUtil; public class Comm extends Thread implements NodeShutdownHook { private static DTFLogger _logger = DTFLogger.getLogger(Comm.class); private static final String TUNNEL_CONF_FILE = "tunnel.conf"; /* * 5s heart beat all registered components every 5s, now if a heart beat * takes 50ms then in 5s we are able to heart beat at least 1000 active * components and for now that "limitation" is acceptable and well * understood. */ private long HEARTBEAT_INTERVAL = 5000; /* * Maximum amount of time that we'll tolerate without heart beating a * component is 20s */ private long HEARTBEAT_TIMEOUT = 20000; private CommServer _server = null; private boolean _running = true; private static HashMap<String, Integer> _tunnels = null; public static HashMap<String, Integer> getTunnels() { return _tunnels; } public static void addTunnel(String host, int rport, int lport) { _logger.info("Tunnel added for " + host + ":" + rport + ":" + lport); _tunnels.put(host+rport,lport); } public static boolean tunnelExists(String host, int rport) { return _tunnels.containsKey(host + rport); } private static boolean _tunneled = false; public static boolean isTunneled() { return _tunneled; } static { /* * Start off by loading any tunneling information into memory. This * information will be passed around to all nodes connected to this * controller and therefore everyone will know about all existing * tunnels. */ _tunnels = new HashMap<String, Integer>(); if (new File(TUNNEL_CONF_FILE).exists()) { FileInputStream fis = null; Properties props = null; _tunneled = true; try { fis = new FileInputStream(TUNNEL_CONF_FILE); props = new Properties(); props.load(fis); Enumeration hosts = props.keys(); while (hosts.hasMoreElements()) { String host = (String) hosts.nextElement(); Integer port = null; String portstring = props.getProperty(host); if ( portstring.indexOf('=') != -1 ) { String[] ports = portstring.split("="); port = new Integer(ports[0]); } else { port = new Integer(portstring); } host = InetAddress.getByName(host).getHostAddress(); addTunnel(host, port, port); } } catch (IOException e) { throw new RuntimeException("Unable to open/parse " + TUNNEL_CONF_FILE + " file.",e); } finally { if ( fis != null ) { try { fis.close(); } catch (IOException e) { throw new RuntimeException("Unable to close tunnel file.",e); } } } } } public Comm(Config config) throws DTFException { String heartbeat = config.getProperty("dtf.heartbeat.timeout"); if (heartbeat != null) { _logger.info("Changing default hearbeat timeout to " + heartbeat + "ms"); HEARTBEAT_TIMEOUT = new Long(heartbeat).longValue(); } String laddr = config.getProperty(DTFProperties.DTF_LISTEN_ADDR); int lport = -1; try { lport = config.getPropertyAsInt(DTFProperties.DTF_LISTEN_PORT,-1); } catch (PropertyException e) { throw new CommException("Port number bad format.",e); } if (laddr == null) { laddr = HostUtils.getHostAddress(); config.setProperty(DTFProperties.DTF_LISTEN_ADDR, laddr); } _logger.info("Host address [" + laddr + "]"); String type = config.getProperty(DTFProperties.DTF_NODE_TYPE); if (type == null) throw new CommException(DTFProperties.DTF_NODE_TYPE + " can not be null."); _server = new CommRMIServer(laddr,lport); try { _server.start(); } catch (CommException e) { throw new CommException("Unable to start CommServer.", e); } // Because the port can be selected by the RPCServer when the chosen // one is not available config.setProperty(DTFProperties.DTF_LISTEN_PORT,""+_server.getPort()); if (type.equalsIgnoreCase("dtfc")) { // DTFC node _logger.info("DTFC Setup."); // Controller node with Controller handler available _server.addHandler("node", new Node()); Action.getState().disableReplace(); } else if (type.equalsIgnoreCase("dtfa")) { // DTFA node _logger.info("DTFA Setup."); // heart beat handler for DTFC to be able to check up on each agent _server.addHandler("node", new Node()); } else { // DTFX node _logger.info("DTFX Setup."); // Any other DTF node has the basic heart beat handler up _server.addHandler("node", new Node()); } CommClient.addAgentAttribute(DTFProperties.DTF_NODE_TYPE, DTFNode.getType()); CommClient.addAgentAttribute(DTFProperties.DTF_NODE_OS, System.getProperty("os.name")); CommClient.addAgentAttribute(DTFProperties.DTF_NODE_OS_ARCH, System.getProperty("os.arch")); CommClient.addAgentAttribute(DTFProperties.DTF_NODE_OS_VER, System.getProperty("os.version")); CommClient.addAgentAttribute(DTFProperties.DTF_DEBUG_PORT, DebugServer.getInstance().getPort()); CommClient.addAgentAttribute("dtf.node.host", laddr); CommClient.addAgentAttribute("dtf.node.user", System.getProperty("user.name")); CommClient.addAgentAttribute("dtf.node.home", SystemUtil.getCWD()); if (!type.equalsIgnoreCase("dtfc")) { /* * only DTFA's will register automatically to the DTFC */ if (type.equalsIgnoreCase("dtfa")) { checkAndConnectToDTFC(); } } DTFNode.registerShutdownHook(this); } private boolean _connected = false; private Object _lockComm = new Object(); public boolean isConnected() { return _connected; } public synchronized void checkAndConnectToDTFC() throws DTFException { synchronized(_lockComm) { if (!_connected) { Config config = Action.getConfig(); String caddr = config.getProperty(DTFProperties.DTF_CONNECT_ADDR, DTFProperties.DTF_CONNECT_ADDR_DEFAULT); int cport = -1; try { cport = config.getPropertyAsInt(DTFProperties.DTF_CONNECT_PORT, DTFProperties.DTF_CONNECT_PORT_DEFAULT); } catch (PropertyException e) { throw new CommException("Port number bad format.",e); } CommClient client = new CommRMIClient(caddr, cport, _server); _clients.put("dtfc", client); _connected = true; } if ( Action.getLocalID() == null ) { getCommClient("dtfc").register(); } if ( Action.getLogger().isDebugEnabled() ) { Action.getLogger().debug("Connected [" + Action.getLocalID() + "]"); } } } public CommServer getCommServer() { return _server; } private static long lastHeartbeat = System.currentTimeMillis(); public static void heartbeat() { lastHeartbeat = System.currentTimeMillis(); } private boolean keepAlive() { long heartbeat = (System.currentTimeMillis() - lastHeartbeat); if (heartbeat > HEARTBEAT_TIMEOUT) { return false; } return true; } private static long lastmessage = 0; public void run() { long lastUpdate = System.currentTimeMillis(); while (_running) { if (_logger.isDebugEnabled()) { if (System.currentTimeMillis() - lastUpdate > 10000) { lastUpdate = System.currentTimeMillis(); Runtime rt = Runtime.getRuntime(); int MB = 1048567; _logger.debug("JVM MEMORY MAX(MB): " + (rt.maxMemory()/MB) + " FREE(MB): " + (rt.freeMemory()/MB) + " TOTAL(MB): " + (rt.totalMemory()/MB)); } } if (DTFNode.getType().equals("dtfc")) { /* * Heart-beating all registered components. */ NodeState ns = NodeState.getInstance(); //ns.checkForOrphans(); ArrayList nodes = ns.getRegisteredNodes(); for (int i = 0; i < nodes.size(); i++) { NodeInfo node = (NodeInfo)nodes.get(i); long start = System.currentTimeMillis(); Boolean result = node.getClient().heartbeat(); long stop = System.currentTimeMillis(); if (_logger.isDebugEnabled()) _logger.debug("Time to heartbeat " + node + " " + (stop-start) + "ms."); if ( !result.booleanValue() ) { /* * Avoid disconnecting a node that is executing a very * CPU/IO intensive task that is leading to having * issues heart beating this component. Use this to * identify to the end user that he/she may be over * stressing the node and that increasing the number of * agents used may be necessary to correctly execute the * currently running test. */ if ( node.getClient().isSendingAction() ) { if ( System.currentTimeMillis() - lastmessage > 5000 ) { lastmessage = System.currentTimeMillis(); _logger.warn("Node " + node + " is executing an action but not " + "heartbeating well. That agent may" + " be overwhelmed with work and you " + " may need to review your currently " + "running test case."); } continue; } /* * heart beat missed the component must have died lets * remove the node from the registeredNodes, which in turn * will unlock any components locked by this agent. */ try { // make sure that the component didn't just go away if (ns.getNodeInfo(node.getId()) != null) { _logger.info("Heartbeat missed for node: " + node + ", releasing locked components."); ns.removeNode(node); } else { _logger.info("Just avoided confusion."); } } catch (DTFException e) { _logger.error("Unable to unregister component " + node,e); } } } } else { /* * Keep Alive check for all nodes except the DTFC, this is used by * DTF nodes to know when the DTFC has gone away. The way it works * is that the DTFC will heart beat each node every 5s but if he * fails to heart beat 2 in a row then the component will just * shutdown. */ if (DTFNode.getType().equals(DTFConstants.DTFX_ID) && !Action.getComponents().hasComponents()) { // if you're a dtfx and you don't have registered components // then you have no reason to care about heartbeats } else { if ( !keepAlive() ) { if ( Node.isExecuting() || Comm.isBusy() ) { if ( System.currentTimeMillis() - lastmessage > 5000 ) { lastmessage = System.currentTimeMillis(); _logger.warn("Avoiding disconnecting because " + "it is currently running an " + "action. This is indicative of a " + "node being overwhelmed with work"); } continue; } _logger.info("DTFC failed to heartbeat for " + HEARTBEAT_TIMEOUT/1000 + "s, shutting down node."); _running = false; return; } } } ThreadUtil.pause(HEARTBEAT_INTERVAL); } } private static AtomicLong execution = new AtomicLong(0); public static boolean isBusy() { return execution.intValue() != 0; } public boolean isUp() { return _running; } public void shutdown() { if (_server != null) _server.shutdown(); Iterator iterator = _clients.keySet().iterator(); while (iterator.hasNext()) { String id = (String)iterator.next(); CommClient c = (CommClient)_clients.get(id); c.shutdown(); } _running = false; } private static HashMap<String, CommClient> _clients = new HashMap<String, CommClient>(); public static void addClient(String id, CommClient client) { _clients.put(id, client); } public static CommClient removeClient(String id) { return _clients.remove(id); } /* * XXX: temporary solution, will need to revisit this soon but for now this * is the easiest way to call back to the runner on the same thread * that originated the current action execution. This is mainly used by * tags that need to return a result back to the same calling thread. */ public Action sendActionToCaller(String id, Action action) throws DTFException { try { execution.incrementAndGet(); Sequence sequence = new Sequence(); String tid = (String)Action.getContext(Node.ACTION_DTFX_THREADID); if ( tid == null ) { tid = Thread.currentThread().getName(); } sequence.setThreadID(tid); sequence.addAction(action); return sendAction(id, sequence); } finally { execution.decrementAndGet(); } } public Action sendAction(String id, Action action) throws DTFException { try { execution.incrementAndGet(); CommClient client = getCommClient(id); return client.sendAction(id, action); } finally { execution.decrementAndGet(); } } public void executeOnComponent(String id, Action action) throws DTFException { try { execution.incrementAndGet(); executeOnComponent(id, action, false); } finally { execution.decrementAndGet(); } } public void executeOnComponent(String id, Action action, boolean nohooks) throws DTFException { try { execution.incrementAndGet(); Component component = new Component(); component.setId(id); component.addAction(action); // this isn't suppose to redo the hooks again... component.execute(nohooks); } finally { execution.decrementAndGet(); } } public CommClient getCommClient(String id) throws DTFException { CommClient client = (CommClient)_clients.get(id); if (client == null) { checkAndConnectToDTFC(); return (CommClient)_clients.get("dtfc"); } return client; } }