package water; import java.net.InetAddress; import java.net.UnknownHostException; import sun.misc.Unsafe; import water.nbhm.UtilUnsafe; import water.util.Log; /** * Maintain a VERY efficient list of events in the system. This must be VERY * cheap to call, as it will get called alot. On demand, we can snapshot this * list gather all other lists from all other (responsive) Nodes, and build a * whole-Cloud timeline for dumping. * * * @author <a href="mailto:cliffc@h2o.ai"></a> * @version 1.0 */ public class TimeLine extends UDP { private static final Unsafe _unsafe = UtilUnsafe.getUnsafe(); // The TimeLine buffer. // The TimeLine buffer is full of Events; each event has a timestamp and some // event bytes. The buffer is a classic ring buffer; we toss away older // events. We snapshot the buffer by replacing it with a fresh array. The // index of the next free slot is kept in the 1st long of the array, and // there are MAX_EVENTS (a power of 2) more slots. // A TimeLine event is: // - Milliseconds since JVM boot; 4 bytes // - IP4 of send/recv // - Sys.Nano, 8 bytes-3 bits // - Nano low bit is 1 id packet was droped, next bit is 0 for send, 1 for recv, next bit is 0 for udp, 1 for tcp // - 16 bytes of payload; 1st byte is a udp_type opcode, next 4 bytes are typically task# public static final int MAX_EVENTS=2048; // Power-of-2, please static final int WORDS_PER_EVENT=4; static final long[] TIMELINE = new long[MAX_EVENTS*WORDS_PER_EVENT+1]; static long JVM_BOOT_MSEC = System.currentTimeMillis(); // Snapshot and return the current TIMELINE array private static long[] snapshot() { return TIMELINE.clone(); } // CAS access to the TIMELINE array private static final int _Lbase = _unsafe.arrayBaseOffset(long[].class); private static final int _Lscale = _unsafe.arrayIndexScale(long[].class); private static long rawIndex(long[] ary, int i) { assert i >= 0 && i < ary.length; return _Lbase + i * _Lscale; } private static boolean CAS( long[] A, int idx, long old, long nnn ) { return _unsafe.compareAndSwapLong( A, rawIndex(A,idx), old, nnn ); } // Return the next index into the TIMELINE array private static int next_idx( long [] tl ) { // Spin until we can CAS-acquire a fresh index while( true ) { int oldidx = (int)tl[0]; int newidx = (oldidx+1)&(MAX_EVENTS-1); if( CAS( tl, 0, oldidx, newidx ) ) return oldidx; } } // Record 1 event, the first 16 bytes of this buffer. This is expected to be // a high-volume multi-thread operation so needs to be fast. "sr" is send- // receive and must be either 0 or 1. "drop" is whether or not the UDP // packet is dropped as-if a network drop, and must be either 0 (kept) or 2 // (dropped). private static void record2( H2ONode h2o, long ns, boolean tcp, int sr, int drop, long b0, long b8 ) { final long ms = System.currentTimeMillis(); // Read first, in case we're slow storing values long deltams = ms-JVM_BOOT_MSEC; assert deltams < 0x0FFFFFFFFL; // No daily overflow final long[] tl = TIMELINE; // Read once, in case the whole array shifts out from under us final int idx = next_idx(tl); // Next free index tl[idx*WORDS_PER_EVENT+0+1] = (deltams)<<32 | (h2o.ip4()&0x0FFFFFFFFL); tl[idx*WORDS_PER_EVENT+1+1] = (ns&~7)| (tcp?4:0)|sr|drop; // More complexities: record the *receiver* port in the timeline - but not // in the outgoing UDP packet! The outgoing packet always has the sender's // port (that's us!) - which means the recorded timeline packet normally // never carries the *receiver* port - meaning the sender's timeline does // not record who he sent to! With this hack the Timeline record always // contains the info about "the other guy": inet+port for the receiver in // the sender's Timeline, and vice-versa for the receiver's Timeline. if( sr==0 ) b0 = (b0 & ~0xFFFF00) | (h2o._key.udp_port()<<8); tl[idx*WORDS_PER_EVENT+2+1] = b0; tl[idx*WORDS_PER_EVENT+3+1] = b8; } private static void record1( AutoBuffer b, boolean tcp, int sr, int drop) { try { int lim = b._bb.limit(); int pos = b._bb.position(); b._bb.limit(16); long lo = b.get8(0), hi = b.get8(8); final long ns = System.nanoTime(); record2(b._h2o, ns, tcp, sr, drop, lo, hi); b._bb.limit(lim); b._bb.position(pos); } catch(Throwable t) { System.err.println("Timeline record failed, " + t.toString()); Log.err(t); } } static void record_send( AutoBuffer b, boolean tcp) { record1(b,tcp,0, 0); } static void record_recv( AutoBuffer b, boolean tcp, int drop) { record1(b,tcp,1,drop); } // Record a completed I/O event. The nanosecond time slot is actually nano's-blocked-on-io // static void record_IOclose( AutoBuffer b, int flavor ) { // H2ONode h2o = b._h2o==null ? H2O.SELF : b._h2o; // // First long word going out has sender-port and a 'bad' control packet // long b0 = UDP.udp.i_o.ordinal(); // Special flag to indicate io-record and not a rpc-record // b0 |= H2O.SELF._key.udp_port()<<8; // b0 |= flavor<<24; // I/O flavor; one of the Value.persist backends // long iotime = b._time_start_ms > 0 ? (b._time_close_ms - b._time_start_ms) : 0; // b0 |= iotime<<32; // msec from start-to-finish, including non-i/o overheads // long b8 = b._size; // byte's transfered in this I/O // long ns = b._time_io_ns; // nano's blocked doing I/O // record2(h2o,ns,true,b.readMode()?1:0,0,b0,b8); // } /* Record an I/O call without using an AutoBuffer / NIO. * Used by e.g. HDFS & S3 * * @param block_ns - ns of blocking i/o call, * @param io_msg - ms of overall i/o time * @param r_w - 1 for read, 0 for write * @param size - bytes read/written * @param flavor - Value.HDFS or Value.S3 */ // public static void record_IOclose( long start_ns, long start_io_ms, int r_w, long size, int flavor ) { // long block_ns = System.nanoTime() - start_ns; // long io_ms = System.currentTimeMillis() - start_io_ms; // // First long word going out has sender-port and a 'bad' control packet // long b0 = UDP.udp.i_o.ordinal(); // Special flag to indicate io-record and not a rpc-record // b0 |= H2O.SELF._key.udp_port()<<8; // b0 |= flavor<<24; // I/O flavor; one of the Value.persist backends // b0 |= io_ms<<32; // msec from start-to-finish, including non-i/o overheads // record2(H2O.SELF,block_ns,true,r_w,0,b0,size); // } // Accessors, for TimeLines that come from all over the system public static int length( ) { return MAX_EVENTS; } // Internal array math so we can keep layout private private static int idx(long[] tl, int i ) { return (((int)tl[0]+i)&(MAX_EVENTS-1))*WORDS_PER_EVENT+1; } // That first long is complex: compressed CTM and IP4 private static long x0( long[] tl, int idx ) { return tl[idx(tl,idx)]; } // ms since boot of JVM public static long ms( long[] tl, int idx ) { return x0(tl,idx)>>>32; } public static InetAddress inet( long[] tl, int idx ) { int adr = (int)x0(tl,idx); byte[] ip4 = new byte[4]; ip4[0] = (byte)(adr ); ip4[1] = (byte)(adr>> 8); ip4[2] = (byte)(adr>>16); ip4[3] = (byte)(adr>>24); try { return InetAddress.getByAddress(ip4); } catch( UnknownHostException e ) { } return null; } // That 2nd long is nanosec, plus the low bit is send/recv & 2nd low is drop public static long ns( long[] tl, int idx ) { return tl[idx(tl,idx)+1]; } // Returns zero for send, 1 for recv public static int send_recv( long[] tl, int idx ) { return (int)(ns(tl,idx)&1); } // Returns zero for kept, 2 for dropped public static int dropped ( long[] tl, int idx ) { return (int)(ns(tl,idx)&2); } // 16 bytes of payload public static long l0( long[] tl, int idx ) { return tl[idx(tl,idx)+2]; } public static long l8( long[] tl, int idx ) { return tl[idx(tl,idx)+3]; } public static boolean isEmpty( long[] tl, int idx ) { return tl[idx(tl,idx)]==0; } // Take a system-wide snapshot. Return an array, indexed by H2ONode _idx, // containing that Node's snapshot. Try to get all the snapshots as close as // possible to the same point in time. static long[][] SNAPSHOT; static long TIME_LAST_SNAPSHOT = 1; static private H2O CLOUD; // Cloud instance being snapshotted public static H2O getCLOUD(){return CLOUD;} static public long[][] system_snapshot() { // Now spin-wait until we see all snapshots check in. // Be atomic about it. synchronized( TimeLine.class ) { // First see if we have a recent snapshot already. long now = System.currentTimeMillis(); if( now - TIME_LAST_SNAPSHOT < 3*1000 ) return SNAPSHOT; // Use the recent snapshot // A new snapshot is being built? if( TIME_LAST_SNAPSHOT != 0 ) { TIME_LAST_SNAPSHOT = 0; // Only fire off the UDP packet once; flag it // Make a new empty snapshot CLOUD = H2O.CLOUD; SNAPSHOT = new long[CLOUD.size()][]; // Broadcast a UDP packet, with the hopes of getting all SnapShots as close // as possible to the same point in time. new AutoBuffer(H2O.SELF,udp.timeline._prior).putUdp(udp.timeline).close(); } // Spin until all snapshots appear while( true ) { boolean done = true; for( int i=0; i<CLOUD._memary.length; i++ ) if( SNAPSHOT[i] == null ) done = false; if( done ) break; try { TimeLine.class.wait(); } catch( InterruptedException e ) {} } TIME_LAST_SNAPSHOT = System.currentTimeMillis(); return SNAPSHOT; } } // Send our most recent timeline to the remote via TCP @Override AutoBuffer call( AutoBuffer ab ) { long[] a = snapshot(); if( ab._h2o == H2O.SELF ) { synchronized(TimeLine.class) { for( int i=0; i<CLOUD._memary.length; i++ ) if( CLOUD._memary[i]==H2O.SELF ) SNAPSHOT[i] = a; TimeLine.class.notify(); } } else // Send timeline to remote new AutoBuffer(ab._h2o,udp.timeline._prior).putUdp(UDP.udp.timeline).putA8(a).close(); return null; } // Receive a remote timeline static void tcp_call( final AutoBuffer ab ) { ab.getPort(); long[] snap = ab.getA8(); int idx = CLOUD.nidx(ab._h2o); if (idx >= 0 && idx < SNAPSHOT.length) SNAPSHOT[idx] = snap; // Ignore out-of-cloud timelines ab.close(); synchronized (TimeLine.class) { TimeLine.class.notify(); } } String print16( AutoBuffer ab ) { return ""; } // no extra info in a timeline packet /** * Only for debugging. * Prints local timeline to stdout. * * To be used in case of an error when global timeline can not be relied upon as we might not be able to talk to other nodes. */ static void printMyTimeLine(){ long [] s = TimeLine.snapshot(); System.err.println("===================================<TIMELINE>=============================================="); for(int i = 0; i < TimeLine.length(); ++i) { long lo = TimeLine.l0(s, i),hi = TimeLine.l8(s, i); int port = (int)((lo >> 8) & 0xFFFF); String op = TimeLine.send_recv(s,i) == 0?"SEND":"RECV"; if(!TimeLine.isEmpty(s, i) && (lo & 0xFF) == UDP.udp.exec.ordinal()) System.err.println(TimeLine.ms(s, i) + ": " + op + " " + (((TimeLine.ns(s, i) & 4) != 0)?"TCP":"UDP") + TimeLine.inet(s, i) + ":" + port + " | " + UDP.printx16(lo, hi)); } System.err.println("==========================================================================================="); } }