package water.api.schemas3; import water.*; import water.api.API; import java.util.concurrent.ConcurrentHashMap; public class CloudV3 extends RequestSchemaV3<Iced, CloudV3> { /** * Data structure to store last tick counts from a given node. */ private static class LastTicksEntry { final public long _system_idle_ticks; final public long _system_total_ticks; final public long _process_total_ticks; LastTicksEntry(HeartBeat hb) { _system_idle_ticks = hb._system_idle_ticks; _system_total_ticks = hb._system_total_ticks; _process_total_ticks = hb._process_total_ticks; } } /** * Store last tick counts for each node. * * This is local to a node and doesn't need to be Iced, so make it transient. * Access this each time the Cloud status page is called on this node. * * The window of tick aggregation is between calls to this page (which might come from the browser or from REST * API clients). * * Note there is no attempt to distinguish between REST API sessions. Every call updates the last tick count info. */ private static transient ConcurrentHashMap<String,LastTicksEntry> ticksHashMap = new ConcurrentHashMap<>(); public CloudV3() {} // Input fields @API(help="skip_ticks", direction=API.Direction.INPUT) public boolean skip_ticks = false; // Output fields @API(help="version", direction=API.Direction.OUTPUT) public String version; @API(help="branch_name", direction=API.Direction.OUTPUT) public String branch_name; @API(help="build_number", direction=API.Direction.OUTPUT) public String build_number; @API(help="build_age", direction=API.Direction.OUTPUT) public String build_age; @API(help="build_too_old", direction=API.Direction.OUTPUT) public boolean build_too_old; @API(help="Node index number cloud status is collected from (zero-based)", direction=API.Direction.OUTPUT) public int node_idx; @API(help="cloud_name", direction=API.Direction.OUTPUT) public String cloud_name; @API(help="cloud_size", direction=API.Direction.OUTPUT) public int cloud_size; @API(help="cloud_uptime_millis", direction=API.Direction.OUTPUT) public long cloud_uptime_millis; @API(help="cloud_healthy", direction=API.Direction.OUTPUT) public boolean cloud_healthy; @API(help="Nodes reporting unhealthy", direction=API.Direction.OUTPUT) public int bad_nodes; @API(help="Cloud voting is stable", direction=API.Direction.OUTPUT) public boolean consensus; @API(help="Cloud is accepting new members or not", direction=API.Direction.OUTPUT) public boolean locked; @API(help="Cloud is in client mode.", direction=API.Direction.OUTPUT) public boolean is_client; @API(help="nodes", direction=API.Direction.OUTPUT) public NodeV3[] nodes; @API(help="internal_security_enabled", direction=API.Direction.OUTPUT) public boolean internal_security_enabled; // Output fields one-per-JVM public static class NodeV3 extends SchemaV3<Iced, NodeV3> { public NodeV3() {} @API(help="IP", direction=API.Direction.OUTPUT) public String h2o; @API(help="IP address and port in the form a.b.c.d:e", direction=API.Direction.OUTPUT) public String ip_port; @API(help="(now-last_ping)<HeartbeatThread.TIMEOUT", direction=API.Direction.OUTPUT) public boolean healthy; @API(help="Time (in msec) of last ping", direction=API.Direction.OUTPUT) public long last_ping; @API(help="PID", direction=API.Direction.OUTPUT) public int pid; @API(help="num_cpus", direction=API.Direction.OUTPUT) public int num_cpus; @API(help="cpus_allowed", direction=API.Direction.OUTPUT) public int cpus_allowed; @API(help="nthreads", direction=API.Direction.OUTPUT) public int nthreads; @API(help="System load; average #runnables/#cores", direction=API.Direction.OUTPUT) public float sys_load; // Average #runnables/#cores @API(help="System CPU percentage used by this H2O process in last interval", direction=API.Direction.OUTPUT) public int my_cpu_pct; @API(help="System CPU percentage used by everything in last interval", direction=API.Direction.OUTPUT) public int sys_cpu_pct; @API(help="Data on Node memory", direction=API.Direction.OUTPUT) public long mem_value_size; @API(help="Temp (non Data) memory", direction=API.Direction.OUTPUT) public long pojo_mem; @API(help="Free heap", direction=API.Direction.OUTPUT) public long free_mem; @API(help="Maximum memory size for node", direction=API.Direction.OUTPUT) public long max_mem; @API(help="Size of data on node's disk", direction=API.Direction.OUTPUT) public long swap_mem; @API(help="#local keys", direction=API.Direction.OUTPUT) public int num_keys; @API(help="Free disk", direction=API.Direction.OUTPUT) public long free_disk; @API(help="Max disk", direction=API.Direction.OUTPUT) public long max_disk; @API(help="Active Remote Procedure Calls", direction=API.Direction.OUTPUT) public int rpcs_active; @API(help="F/J Thread count, by priority", direction=API.Direction.OUTPUT) public short fjthrds[]; @API(help="F/J Task count, by priority", direction=API.Direction.OUTPUT) public short fjqueue[]; @API(help="Open TCP connections", direction=API.Direction.OUTPUT) public int tcps_active; @API(help="Open File Descripters", direction=API.Direction.OUTPUT) public int open_fds; @API(help="Linpack GFlops", direction=API.Direction.OUTPUT) public double gflops; @API(help="Memory Bandwidth", direction=API.Direction.OUTPUT) public double mem_bw; public NodeV3(H2ONode h2o, boolean skip_ticks) { HeartBeat hb = h2o._heartbeat; // Basic system health this.h2o = h2o.toString(); ip_port = h2o.getIpPortString(); healthy = h2o.isHealthy(); last_ping = h2o._last_heard_from; sys_load = hb._system_load_average; gflops = hb._gflops; mem_bw = hb._membw; // Memory being used mem_value_size = hb.get_kv_mem(); pojo_mem = hb.get_pojo_mem(); free_mem = hb.get_free_mem(); swap_mem = hb.get_swap_mem(); max_mem = pojo_mem + free_mem + mem_value_size; num_keys = hb._keys; // Disk health free_disk = hb.get_free_disk(); max_disk = hb.get_max_disk(); // Fork/Join Activity rpcs_active = hb._rpcs; fjthrds = hb._fjthrds; fjqueue = hb._fjqueue; // System properties & I/O Status tcps_active = hb._tcps_active; open_fds = hb._process_num_open_fds; // -1 if not available num_cpus = hb._num_cpus; cpus_allowed = hb._cpus_allowed; nthreads = hb._nthreads; pid = hb._pid; // Use tick information to calculate CPU usage percentage for the entire system and // for the specific H2O node. // // Note that 100% here means "the entire box". This is different from 'top' 100%, // which usually means one core. my_cpu_pct = -1; sys_cpu_pct = -1; if (!skip_ticks) { LastTicksEntry lte = ticksHashMap.get(h2o.toString()); if (lte != null) { long system_total_ticks_delta = hb._system_total_ticks - lte._system_total_ticks; // Avoid divide by 0 errors. if (system_total_ticks_delta > 0) { long system_idle_ticks_delta = hb._system_idle_ticks - lte._system_idle_ticks; double sys_cpu_frac_double = 1 - ((double)(system_idle_ticks_delta) / (double)system_total_ticks_delta); if (sys_cpu_frac_double < 0) sys_cpu_frac_double = 0; // Clamp at 0. else if (sys_cpu_frac_double > 1) sys_cpu_frac_double = 1; // Clamp at 1. sys_cpu_pct = (int)(sys_cpu_frac_double * 100); long process_total_ticks_delta = hb._process_total_ticks - lte._process_total_ticks; double process_cpu_frac_double = ((double)(process_total_ticks_delta) / (double)system_total_ticks_delta); // Saturate at 0 and 1. if (process_cpu_frac_double < 0) process_cpu_frac_double = 0; // Clamp at 0. else if (process_cpu_frac_double > 1) process_cpu_frac_double = 1; // Clamp at 1. my_cpu_pct = (int)(process_cpu_frac_double * 100); } } LastTicksEntry newLte = new LastTicksEntry(hb); ticksHashMap.put(h2o.toString(), newLte); } } } }