H2ONode.java example

Explorer
h2o-3-master
package water;

import water.RPC.RPCCall;
import water.nbhm.NonBlockingHashMap;
import water.nbhm.NonBlockingHashMapLong;
import water.network.SocketChannelFactory;
import water.util.ArrayUtils;
import water.util.Log;
import water.util.MathUtils;
import water.util.UnsafeUtils;

import java.io.IOException;
import java.net.*;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.ByteChannel;
import java.nio.channels.DatagramChannel;
import java.nio.channels.SocketChannel;
import java.util.*;
import java.util.concurrent.PriorityBlockingQueue;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * A <code>Node</code> in an <code>H2O</code> Cloud.
 * Basically a worker-bee with CPUs, Memory and Disk.
 * One of these is the self-Node, but the rest are remote Nodes.
 *
 * @author <a href="mailto:cliffc@h2o.ai"></a>
 * @version 1.0
 */

public final class H2ONode extends Iced<H2ONode> implements Comparable {
  transient private SocketChannelFactory _socketFactory;
  transient private H2OSecurityManager _security;

  transient short _unique_idx; // Dense integer index, skipping 0.  NOT cloud-wide unique.
  transient boolean _announcedLostContact;  // True if heartbeat published a no-contact msg
  transient public long _last_heard_from; // Time in msec since we last heard from this Node
  transient public volatile HeartBeat _heartbeat;  // My health info.  Changes 1/sec.
  transient public int _tcp_readers;               // Count of started TCP reader threads

  public boolean _removed_from_cloud;
  /**
   * Asks this node's batching send thread (if any) to stop, and marks the node
   * as removed from the cloud.
   *
   * The thread is only flagged through {@code _stopRequested}; it is not
   * interrupted here, so it observes the request the next time it re-checks
   * that flag.
   */
  public void stopSendThread(){
    // Read the shared field exactly once: testing it and then re-reading it
    // could dereference null if another caller cleared it in between.
    final UDP_TCP_SendThread sender = _sendThread;
    if( sender != null ) {
      sender._stopRequested = true;
      _sendThread = null;
    }
    _removed_from_cloud = true;
  }
  // A JVM is uniquely named by machine IP address and port#
  public final H2Okey _key;

  /**
   * Identity of a node: its IP address plus port.
   *
   * Besides being an {@link InetSocketAddress}, the key keeps the IP in
   * numeric form so that keys can be ordered and written to an
   * {@code AutoBuffer} directly.
   */
  static final class H2Okey extends InetSocketAddress implements Comparable {
    // Numeric representation of the IP.  For IPv6 both fields are valid and
    // together describe the full address; for IPv4 only the low 32 bits of
    // _ipLow are valid.  A flag is still needed to tell IPv4 from IPv6.
    final long _ipHigh, _ipLow; // IPv4: A.B.C.D ~ DCBA

    /** Builds the key and derives the numeric IP from the raw address bytes. */
    H2Okey(InetAddress inet, int port) {
      super(inet, port);
      final byte[] raw = inet.getAddress(); // 4 bytes (IPv4) or 16 bytes (IPv6)
      final boolean fourByteAddress = raw.length == 4;
      if (fourByteAddress) {
        assert !H2O.IS_IPV6 : "IPv4 stack specified but IPv6 address passed! " + inet;
        _ipHigh = 0;
        _ipLow = ArrayUtils.encodeAsInt(raw) & 0XFFFFFFFFL;
      } else {
        assert H2O.IS_IPV6 : "IPv6 stack specified but IPv4 address passed! " + inet;
        _ipHigh = ArrayUtils.encodeAsLong(raw, 8, 8);
        _ipLow = ArrayUtils.encodeAsLong(raw, 0, 8);
      }
    }

    /** The port stored in this key, minus one. */
    public int htm_port() {
      return getPort() - 1;
    }

    /** The port stored in this key. */
    public int udp_port() {
      return getPort();
    }

    /** Renders the address followed by ':' and {@link #htm_port()}. */
    @Override public String toString() {
      return getAddress() + ":" + htm_port();
    }

    /** Renders the textual host address followed by ':' and {@link #htm_port()}. */
    public String getIpPortString() {
      final String host = getAddress().getHostAddress();
      return host + ":" + htm_port();
    }

    /** Serializes the numeric IP (4 bytes, or two 8-byte halves) followed by the 2-byte port. */
    AutoBuffer write( AutoBuffer ab ) {
      final AutoBuffer afterIp;
      if( H2O.IS_IPV6 ) {
        afterIp = ab.put8(_ipLow).put8(_ipHigh);
      } else {
        afterIp = ab.put4((int) _ipLow);
      }
      return afterIp.put2((char) udp_port());
    }

    /** Reads SIZE_OF_IP address bytes and a 2-byte port, and builds the key from them. */
    static H2Okey read( AutoBuffer ab ) {
      final InetAddress inet;
      try {
        inet = InetAddress.getByAddress(ab.getA1(SIZE_OF_IP));
      } catch( UnknownHostException e ) {
        throw Log.throwErr(e);
      }
      final int port = ab.get2();
      return new H2Okey(inet, port);
    }

    // Canonical ordering based on inet & port
    @Override public int compareTo(Object x) {
      if( x == null ) return -1;   // Always before null
      if( x == this ) return 0;
      final H2Okey other = (H2Okey)x;
      // Must be unsigned long-arithmetic, or overflow will make a broken sort
      final int byAddress = MathUtils.compareUnsigned(_ipHigh, _ipLow, other._ipHigh, other._ipLow);
      if( byAddress != 0 ) return byAddress;
      return udp_port() - other.udp_port();
    }

    /** Number of raw IP bytes for the selected IP stack. */
    static int SIZE_OF_IP = H2O.IS_IPV6 ? 16 : 4;
    /** Size of serialized H2OKey */
    static int SIZE = SIZE_OF_IP /* ip */ + 2 /* port */;
  }

  /** Returns this node's "ip:port" string, as produced by its key's {@code getIpPortString()}. */
  public String getIpPortString() {
    return _key.getIpPortString();
  }

  /** Returns the key's {@code _ipLow} narrowed to an int, i.e. its low 32 bits. */
  public final int ip4() { return (int) _key._ipLow; }

  // These are INTERN'd upon construction, and are uniquely numbered within the
  // same run of a JVM.  If a remote Node goes down, then back up... it will
  // come back with the SAME IP address, and the same unique_idx and history
  // relative to *this* Node.  They can be compared with pointer-equality.  The
  // unique idx is used to know which remote Nodes have cached which Keys, even
  // if the Home#/Replica# change for a Key due to an unrelated change in Cloud
  // membership.  The unique_idx is *per Node*; not all Nodes agree on the same
  // indexes.
  //
  // Private: instances are only created through the intern(...) factories.
  private H2ONode( H2Okey key, short unique_idx ) {
    _key = key;
    _unique_idx = unique_idx;
    // A freshly created node counts as "heard from" at construction time
    _last_heard_from = System.currentTimeMillis();
    _heartbeat = new HeartBeat();

    // The socket factory is derived from the security manager singleton
    _security = H2OSecurityManager.instance();
    _socketFactory = SocketChannelFactory.instance(_security);
  }

  /** Health check evaluated against the current wall-clock time. */
  public boolean isHealthy() {
    final long now = System.currentTimeMillis();
    return isHealthy(now);
  }

  /**
   * A node is healthy when the time elapsed between {@code _last_heard_from}
   * and {@code now} is strictly below {@code HeartBeatThread.TIMEOUT}.
   *
   * @param now reference time in milliseconds
   */
  public boolean isHealthy(long now) {
    final long silentFor = now - _last_heard_from;
    return silentFor < HeartBeatThread.TIMEOUT;
  }

  // ---------------
  // A dense integer index for every unique IP ever seen, since the JVM booted.
  // Used to track "known replicas" per-key across Cloud change-ups.  Just use
  // an array-of-H2ONodes
  static private final NonBlockingHashMap<H2Okey,H2ONode> INTERN = new NonBlockingHashMap<>();
  static private final AtomicInteger UNIQUE = new AtomicInteger(1);
  static H2ONode IDX[] = new H2ONode[1];

  // Create and/or re-use an H2ONode.  Each gets a unique dense index, and is
  // *interned*: there is only one per H2Okey (address + port).
  static private H2ONode intern( H2Okey key ) {
    H2ONode h2o = INTERN.get(key);
    if( h2o != null ) return h2o;       // Fast path: already interned
    // Claim an index before racing to publish.  If the putIfAbsent below
    // loses, this index is simply never stored in IDX.
    final int idx = UNIQUE.getAndIncrement();
    assert idx < Short.MAX_VALUE;
    h2o = new H2ONode(key,(short)idx);
    H2ONode old = INTERN.putIfAbsent(key,h2o);
    if( old != null ) return old;       // Lost the race; hand back the winner
    // Only the thread that won the putIfAbsent gets here.  Grow IDX by
    // doubling until idx fits, then record the node at its index.
    synchronized(H2O.class) {
      while( idx >= IDX.length )
        IDX = Arrays.copyOf(IDX,IDX.length<<1);
      IDX[idx] = h2o;
    }
    // NOTE(review): the node is already visible through INTERN at this point,
    // but _sendThread is only assigned now - a concurrent intern() of the same
    // key could observe the node before its send thread exists.  Confirm
    // callers cannot sendMessage() in that window.
    h2o._sendThread = h2o.new UDP_TCP_SendThread(); // Launch the UDP send thread
    h2o._sendThread.start();
    return h2o;
  }
  /** Interns the node identified by the given address and port (wrapped in a new H2Okey). */
  public static H2ONode intern( InetAddress ip, int port ) { return intern(new H2Okey(ip,port)); }

  /**
   * Interns the node whose key is laid out in {@code bs} starting at
   * {@code off}: the raw IP bytes (4 or 16, depending on the selected IP
   * stack) immediately followed by a 2-byte port.
   */
  public static H2ONode intern( byte[] bs, int off ) {
    // The buffer size depends on the version of the selected IP stack
    final byte[] ip = new byte[H2Okey.SIZE_OF_IP];
    // The static constant should be optimized
    if (H2O.IS_IPV6) {
      // IPv6: copy the address as two 8-byte words
      UnsafeUtils.set8(ip, 0, UnsafeUtils.get8(bs, off));
      UnsafeUtils.set8(ip, 8, UnsafeUtils.get8(bs, off + 8));
    } else {
      // IPv4: a single 4-byte word
      UnsafeUtils.set4(ip, 0, UnsafeUtils.get4(bs, off));
    }
    // The port follows the address; mask to keep it unsigned
    final int port = UnsafeUtils.get2(bs,off + H2Okey.SIZE_OF_IP) & 0xFFFF;
    final InetAddress address;
    try {
      address = InetAddress.getByAddress(ip);
    } catch( UnknownHostException e ) {
      throw Log.throwErr(e);
    }
    return intern(address,port);
  }

  // Get a nice Node Name for this Node in the Cloud.  Basically it's the
  // InetAddress we use to communicate to this Node.
  //
  // Side effects: selects H2O.CLOUD_MULTICAST_IF from the interfaces that carry
  // `local`, opens NetworkInit.CLOUD_DGRAM, and finally interns the self node
  // on (local, H2O.H2O_PORT).
  public static H2ONode self(InetAddress local) {
    assert H2O.H2O_PORT != 0;

    // Step 1: collect every network interface that owns our IP address
    try {
      final List<NetworkInterface> candidates = new ArrayList<>();
      for( Enumeration<NetworkInterface> nis = NetworkInterface.getNetworkInterfaces(); nis.hasMoreElements(); ) {
        final NetworkInterface ni = nis.nextElement();
        for( Enumeration<InetAddress> ias = ni.getInetAddresses(); ias.hasMoreElements(); ) {
          if( ias.nextElement().equals(local) ) {
            candidates.add(ni);
            break;              // One match per interface is enough
          }
        }
      }
      if( candidates.isEmpty() ) {
        H2O.CLOUD_MULTICAST_IF = null;
      } else {
        if( candidates.size() > 1 ) {
          // Ambiguous: report all candidates and settle on the first one
          final StringBuilder msg = new StringBuilder("Found multiple network interfaces for ip address ").append(local);
          for( NetworkInterface ni : candidates )
            msg.append("\n\t").append(ni);
          msg.append("\nUsing ").append(candidates.get(0)).append(" for UDP broadcast");
          Log.warn(msg.toString());
        }
        H2O.CLOUD_MULTICAST_IF = candidates.get(0);
      }
    } catch( SocketException e ) {
      throw Log.throwErr(e);
    }

    // Step 2: the selected interface should support multicast (only logged if
    // it does not; the interface is kept), and must be up and running.
    try {
      if( H2O.CLOUD_MULTICAST_IF != null ) {
        if( !H2O.CLOUD_MULTICAST_IF.supportsMulticast() )
          Log.info("Selected H2O.CLOUD_MULTICAST_IF: "+H2O.CLOUD_MULTICAST_IF+ " doesn't support multicast");
        if( !H2O.CLOUD_MULTICAST_IF.isUp() )
          throw new RuntimeException("Selected H2O.CLOUD_MULTICAST_IF: "+H2O.CLOUD_MULTICAST_IF+ " is not up and running");
      }
    } catch( SocketException e ) {
      throw Log.throwErr(e);
    }

    // Step 3: open the cloud datagram channel (expected to be unset so far)
    try {
      assert water.init.NetworkInit.CLOUD_DGRAM == null;
      water.init.NetworkInit.CLOUD_DGRAM = DatagramChannel.open();
    } catch( Exception e ) {
      throw Log.throwErr(e);
    }

    final H2Okey selfKey = new H2Okey(local,H2O.H2O_PORT);
    return intern(selfKey);
  }

  // Happy printable string
  @Override public String toString() { return _key.toString (); }
  /** Hash of the node is the hash of its key, consistent with {@link #equals}. */
  @Override public int hashCode() { return _key.hashCode(); }
  /**
   * Two nodes are equal when their keys are equal.
   *
   * Returns {@code false} for {@code null} and for objects that are not
   * H2ONodes, as the {@code Object.equals} contract requires, instead of
   * failing with a NullPointerException or ClassCastException.
   */
  @Override public boolean equals(Object o) {
    if( o == this ) return true; // Nodes are interned, so this is the common hit
    return o instanceof H2ONode && _key.equals(((H2ONode)o)._key);
  }
  /** Orders nodes by their keys; {@code o} must be a non-null H2ONode. */
  @Override public int compareTo( Object o) { return _key.compareTo(((H2ONode)o)._key); }

  // index of this node in the current cloud... can change at the next cloud.
  public int index() { return H2O.CLOUD.nidx(this); }

  // ---------------
  // A queue of available TCP sockets
  // re-usable TCP socket opened to this node, or null.
  // This is essentially a BlockingQueue/Stack that allows null.
  private transient ByteChannel _socks[] = new ByteChannel[2];
  private transient int _socksAvail=_socks.length;
  // Count of concurrent TCP requests both incoming and outgoing
  static final AtomicInteger TCPS = new AtomicInteger(0);

  /**
   * Hands out a TCP channel to this node, reusing a pooled open channel when
   * one is available and opening a fresh one otherwise.
   *
   * Blocks (re-checking every second) while every socket slot for this node
   * is claimed.  The claimed slot is handed back via {@link #freeTCPSocket}.
   *
   * If creating the fresh connection fails, the half-built socket is closed
   * before the failure propagates so that it does not leak.  The slot claimed
   * for it is NOT released here.
   * NOTE(review): presumably the caller releases it through freeTCPSocket -
   * confirm against the call sites.
   *
   * @return an open channel to this node
   * @throws IOException if a fresh connection cannot be opened or initialized
   */
  ByteChannel getTCPSocket() throws IOException {
    // Under lock, claim an existing open socket if possible
    synchronized(this) {
      // Limit myself to the number of open sockets from node-to-node
      while( _socksAvail == 0 )
        try { wait(1000); } catch( InterruptedException ignored ) { }
      // Claim an open socket
      ByteChannel sock = _socks[--_socksAvail];
      if( sock != null ) {
        if( sock.isOpen() ) return sock; // Return existing socket!
        // Else it's an already-closed socket, lower open TCP count
        assert TCPS.get() > 0;
        TCPS.decrementAndGet();
      }
    }
    // Must make a fresh socket
    SocketChannel sock2 = SocketChannel.open();
    try {
      sock2.socket().setReuseAddress(true);
      sock2.socket().setSendBufferSize(AutoBuffer.BBP_BIG._size);
      boolean res = sock2.connect( _key );
      assert res && !sock2.isConnectionPending() && sock2.isBlocking() && sock2.isConnected() && sock2.isOpen();
      // Startup sequence: type byte, our port, then the 0xef sentinel
      ByteBuffer bb = ByteBuffer.allocate(4).order(ByteOrder.nativeOrder());
      bb.put((byte)2);
      bb.putChar((char)H2O.H2O_PORT);
      bb.put((byte)0xef);
      bb.flip();
      ByteChannel wrappedSocket = _socketFactory.clientChannel(sock2, _key.getHostName(), _key.getPort());
      while(bb.hasRemaining()) {
        wrappedSocket.write(bb);
      }
      TCPS.incrementAndGet();     // Cluster-wide counting
      return wrappedSocket;
    } catch( IOException | RuntimeException | Error e ) {
      // Do not leak the descriptor when connect/handshake fails
      try { sock2.close(); } catch( IOException ignored ) { /* keep the original failure */ }
      throw e;
    }
  }
  /**
   * Returns a socket slot to this node's pool and wakes one waiter.
   *
   * An open channel is kept in the slot for reuse.  A {@code null} or
   * already-closed channel leaves the slot empty and lowers the TCP count.
   *
   * @param sock the channel being handed back; may be null
   */
  synchronized void freeTCPSocket( ByteChannel sock ) {
    assert 0 <= _socksAvail && _socksAvail < _socks.length;
    assert TCPS.get() > 0;
    // Only a still-open channel is worth keeping
    final ByteChannel reusable = ( sock == null || !sock.isOpen() ) ? null : sock;
    _socks[_socksAvail] = reusable;
    _socksAvail++;
    if( reusable == null ) {
      TCPS.decrementAndGet();
    }
    notify();
  }

  // ---------------
  // Send UDP via batched TCP.  Note: has to happen out-of-band with the
  // standard AutoBuffer writing, which can hit the case of needing a TypeId
  // mapping mid-serialization.  Thus this path uses another TCP channel that
  // is specifically not any of the above channels.  This channel is limited to
  // messages which are presented in their entirety (not streamed) thus never
  // need another (nested) TCP channel.
  private transient UDP_TCP_SendThread _sendThread = null; // set notnull if properly interned, and done before first sendMessage
  // Queues a small message on this node's send thread.  Fails with a
  // NullPointerException if _sendThread is null (it is cleared again by
  // stopSendThread()).
  public void sendMessage( ByteBuffer bb, byte msg_priority ) { _sendThread.sendMessage(bb,msg_priority); }

  /**
   * Opens a new connection of type {@code tcpType} to the given address and
   * writes the 4-byte startup sequence (type, this node's port, 0xef sentinel).
   * The type can be either TCPReceiverThread.TCP_SMALL,
   * TCPReceiverThread.TCP_BIG or TCPReceiverThread.TCP_EXTERNAL.
   *
   * Whether the channel is secured depends on the configuration of the
   * {@link SocketChannelFactory}, which wraps the raw socket.
   *
   * If anything fails after the socket has been created (connect refused
   * because the other node is still booting, handshake write error, ...), the
   * socket is closed before the failure propagates, so repeated retries do
   * not leak descriptors.
   *
   * @param tcpType       connection type byte sent first in the startup sequence
   * @param socketFactory factory wrapping the connected socket; must not be
   *                      null, it is dereferenced unconditionally
   * @param originAddr    address to connect to
   * @param originPort    port to connect to
   * @return new socket channel, connected and with the startup sequence sent
   * @throws IOException if the connection cannot be established or written to
   */
  public static ByteChannel openChan(byte tcpType, SocketChannelFactory socketFactory, InetAddress originAddr, int originPort ) throws IOException {
    // Must make a fresh socket
    SocketChannel sock = SocketChannel.open();
    try {
      sock.socket().setReuseAddress(true);
      sock.socket().setSendBufferSize(AutoBuffer.BBP_BIG._size);
      InetSocketAddress isa = new InetSocketAddress(originAddr, originPort);
      boolean res = sock.connect(isa); // Can toss IOEx, esp if other node is still booting up
      assert res : "Should be already connected, but connection is in non-blocking mode and the connection operation is in progress!";
      sock.configureBlocking(true);
      assert !sock.isConnectionPending() && sock.isBlocking() && sock.isConnected() && sock.isOpen();
      sock.socket().setTcpNoDelay(true);
      ByteBuffer bb = ByteBuffer.allocate(4).order(ByteOrder.nativeOrder());
      bb.put(tcpType).putChar((char) H2O.H2O_PORT).put((byte) 0xef).flip();

      ByteChannel wrappedSocket = socketFactory.clientChannel(sock, isa.getHostName(), isa.getPort());

      while (bb.hasRemaining()) {  // Write out magic startup sequence
        wrappedSocket.write(bb);
      }
      return wrappedSocket;
    } catch( IOException | RuntimeException | Error e ) {
      // Release the descriptor; the caller never sees this socket
      try { sock.close(); } catch( IOException ignored ) { /* keep the original failure */ }
      throw e;
    }
  }

  /**
   * Convenience overload: resolves {@code originAddr} with
   * {@link InetAddress#getByName(String)} and delegates to the InetAddress
   * variant of {@code openChan}.
   *
   * @throws IOException if the name cannot be resolved or the connection fails
   */
  public static ByteChannel openChan(byte tcpType, SocketChannelFactory socketFactory, String originAddr, int originPort) throws IOException {
    return openChan(tcpType, socketFactory, InetAddress.getByName(originAddr), originPort);
  }

  // Private thread serving (actually ships the bytes over) small msg Q.
  // Buffers the small messages together and sends the bytes over via TCP channel.
  // Inner (non-static) class: it uses the enclosing node's _key and
  // _socketFactory to open its channel.
  class UDP_TCP_SendThread extends Thread {

    // Set by H2ONode.stopSendThread(); checked by run() and sendBuffer()
    volatile boolean _stopRequested;
    private ByteChannel _chan;  // Lazily made on demand; closed & reopened on error
    private final ByteBuffer _bb; // Reusable output large buffer
  
    public UDP_TCP_SendThread(){
      super("UDP-TCP-SEND-" + H2ONode.this);
      _bb = AutoBuffer.BBP_BIG.make();
    }
  
    /** Send small message to this node.  Passes the message on to a private msg
     *  q, prioritized by the message priority.  MSG queue is served by sender
     *  thread, message are continuously extracted, buffered together and sent
     *  over TCP channel.
     *  @param bb Message to send; must be at position 0 with a non-zero limit
     *  @param msg_priority priority (e.g. NACK and ACKACK beat most other priorities)
     */
    public void sendMessage(ByteBuffer bb, byte msg_priority) {
      assert bb.position()==0 && bb.limit() > 0;
      // Secret back-channel priority: the position field (capped at bb.limit);
      // this is to avoid making Yet Another Object per send.
  
      // Priority can exceed position.  "interesting" priorities are everything
      // above H2O.MIN_HI_PRIORITY and things just above 0; priorities in the
      // middl'n range from 10 to MIN_HI are really rare.  Need to compress
      // priorities a little for this hack to work.
      if( msg_priority >= H2O.MIN_HI_PRIORITY ) msg_priority = (byte)((msg_priority-H2O.MIN_HI_PRIORITY)+10);
      else if( msg_priority >= 10 ) msg_priority = 10;
      if( msg_priority > bb.limit() ) msg_priority = (byte)bb.limit();
      bb.position(msg_priority);
  
      _msgQ.put(bb); 
    }
  
    // Pending messages, ordered by the value stashed in each buffer's position.
    // A PriorityBlockingQueue hands out its least element first, so buffers
    // with a smaller stored position are dequeued first.
    private final PriorityBlockingQueue<ByteBuffer> _msgQ
      = new PriorityBlockingQueue<>(11,new Comparator<ByteBuffer>() {
          // Secret back-channel priority: the position field (capped at bb.limit)
          @Override public int compare( ByteBuffer bb1, ByteBuffer bb2 ) { return bb1.position() - bb2.position(); }
        });
  
    // Main loop: block for one message, then greedily drain whatever else is
    // queued into the same batch buffer, and ship the batch.  Each message is
    // framed as: 2-byte length, payload bytes, 0xef sentinel.
    @Override public void run(){
      try {
        while (!_stopRequested) {            // Forever loop
          try {
            ByteBuffer bb = _msgQ.take(); // take never returns null but blocks instead
            while( bb != null ) {         // while have an BB to process
              assert !bb.isDirect() : "Direct BBs already got recycled";
              assert bb.limit()+1+2 <= _bb.capacity() : "Small message larger than the output buffer";
              if( _bb.remaining() < bb.limit()+1+2 )
                sendBuffer();     // Send full batch; reset _bb so taken bb fits
              _bb.putChar((char)bb.limit());
              // Copies from index 0, not from position(): the position only
              // carries the priority (see sendMessage)
              _bb.put(bb.array(),0,bb.limit()); // Jam this BB into the existing batch BB, all in one go (it all fits)
              _bb.put((byte)0xef);// Sentinel byte
              bb = _msgQ.poll();  // Go get more, same batch
            }
            sendBuffer();         // Send final trailing BBs
          } catch (IllegalMonitorStateException imse) { /* ignore */
          } catch (InterruptedException e) { /*ignore*/ }
        }
      } catch(Throwable t) { throw Log.throwErr(t); }
      // Reached only on a normal stop: the rethrow above skips this cleanup.
      if(_chan != null) {
        try {_chan.close();} catch (IOException e) {}
        _chan = null;
      }
    }
  
    // Writes the whole batch buffer to the channel, reopening the channel and
    // retrying from the start of the batch after every IOException.  Gives up
    // only when a stop has been requested; in that case any unsent bytes are
    // dropped by the final clear().
    void sendBuffer(){
      int retries = 0;
      _bb.flip();                 // limit set to old position; position set to 0
      while( !_stopRequested && _bb.hasRemaining()) {
        try {
          ByteChannel chan = _chan == null ? (_chan=openChan()) : _chan;
          chan.write(_bb);
        } catch(IOException ioe) {
          _bb.rewind();           // Position to zero; limit unchanged; retry the operation
          // Log if not shutting down, and not middle-of-cloud-formation where
          // other node is still booting up (expected common failure), or *never*
          // comes up - such as when not all nodes mentioned in a flatfile will be
          // booted.  Basically the ERRR log will show endless repeat attempts to
          // connect to the missing node
          // Note the condition itself bumps 'retries' when the cloud is not
          // yet locked, in addition to the unconditional increment below.
          if( !_stopRequested && !H2O.getShutdownRequested() && (Paxos._cloudLocked || retries++ > 300) ) {
            Log.err("Got IO error when sending batch UDP bytes: ",ioe);
            retries = 150;      // Throttle the pace of error msgs
          }
          if( _chan != null )
            try { _chan.close(); } catch (Throwable t) {/*ignored*/}
          _chan = null;
          retries++;
          // Back off: 2ms per retry counted so far, capped at 5 seconds
          final int sleep = Math.min(5000,retries << 1);
          try {Thread.sleep(sleep);} catch (InterruptedException e) {/*ignored*/}
        }
      }
      _bb.clear();            // Position set to 0; limit to capacity
    }
  
    // Open channel on first write attempt
    private ByteChannel openChan() throws IOException {
      return H2ONode.openChan(TCPReceiverThread.TCP_SMALL, _socketFactory, _key.getAddress(), _key.getPort());
    }
  }

  // ---------------
  // The *outgoing* client-side calls; pending tasks this Node wants answered.
  // Pending outgoing RPCs, keyed by task number
  private final NonBlockingHashMapLong<RPC> _tasks = new NonBlockingHashMapLong<>();
  // Registers a pending RPC under its task number.  RPCs carrying a
  // TaskPutKey are additionally indexed in _tasksPutKey (see pendingPutKey).
  void taskPut(int tnum, RPC rpc ) { 
    _tasks.put(tnum,rpc); 
    if( rpc._dt instanceof TaskPutKey ) _tasksPutKey.put(tnum,(TaskPutKey)rpc._dt);
  }
  // Looks up the pending RPC for a task number; whatever the map returns
  RPC taskGet(int tnum) { return _tasks.get(tnum); }
  // Drops the task number from both maps
  void taskRemove(int tnum) { 
    _tasks.remove(tnum); 
    _tasksPutKey.remove(tnum);
  }
  // The map's own values() collection of pending RPCs, not a copy
  Collection<RPC> tasks() { return _tasks.values(); }
  // Number of pending RPCs
  int taskSize() { return _tasks.size(); }

  // Finds a pending PutKey against this Key, or null if there is none.
  // Totally a speed optimization for the case where a large number of pending
  // Gets are flooding the tasks() queue, each needing to scan the tasks queue
  // for pending PutKeys to the same Key: only the TaskPutKeys are kept here,
  // so the scan is limited to them.
  private final NonBlockingHashMapLong<TaskPutKey> _tasksPutKey = new NonBlockingHashMapLong<>();
  TaskPutKey pendingPutKey( Key k ) {
    final Iterator<TaskPutKey> pending = _tasksPutKey.values().iterator();
    while( pending.hasNext() ) {
      final TaskPutKey candidate = pending.next();
      if( k.equals(candidate._key) ) {
        return candidate;       // First match wins
      }
    }
    return null;
  }

  // The next unique task# sent *TO* the 'this' Node.  Numbering starts at 1;
  // each call atomically returns the current value and advances the counter.
  private final AtomicInteger _created_task_ids = new AtomicInteger(1);
  int nextTaskNum() { return _created_task_ids.getAndIncrement(); }


  // ---------------
  // The Work-In-Progress list.  Each item is a UDP packet's worth of work.
  // When the RPCCall flips to _computed, then it's Completed work instead of
  // work-in-progress.  Completed work can be short-circuit replied-to by
  // resending the RPC._dt back.  Work that we're sure this Node has seen
  // the reply to can be removed - but we must remember task-completion for all
  // time (because UDP packets can be dup'd and arrive very very late and
  // should not be confused with new work).
  private final NonBlockingHashMapLong<RPC.RPCCall> _work = new NonBlockingHashMapLong<>();

  // We must track even dead/completed tasks for All Time (lest a very very
  // delayed UDP packet look like New Work).  The easy way to do this is leave
  // all work packets/RPCs in the _work HashMap for All Time - but this amounts
  // to a leak.  Instead we "roll up" the eldest completed work items, just
  // remembering their completion status.  Task id's older (smaller) than the
  // _removed_task_ids are both completed, and rolled-up to a single integer.
  private final AtomicInteger _removed_task_ids = new AtomicInteger(0);
  // A Golden Completed Task: it's a shared completed task used to represent
  // all instances of tasks that have been completed and are no longer being
  // tracked separately.
  private final RPC.RPCCall _removed_task = new RPC.RPCCall(this);

  // Looks up the tracked call for a task number.  Task numbers at or below the
  // roll-up counter resolve to the shared Golden Completed Task; anything else
  // is whatever the _work map holds (possibly null).
  RPC.RPCCall has_task( int tnum ) {
    final boolean rolledUp = tnum <= _removed_task_ids.get();
    return rolledUp ? _removed_task : _work.get(tnum);
  }

  // Record a task-in-progress, or return the prior RPC if one already exists.
  // The RPC will flip to "_completed" once the work is done.  The RPC._dtask
  // can be repeatedly ACKd back to the caller, and the _dtask is removed once
  // an ACKACK appears - and the RPC itself is removed once all prior RPCs are
  // also ACKACK'd.
  //
  // Returns: null if 'rpc' was recorded as new work; the pre-existing RPCCall
  // if the task# is already tracked; or the Golden Completed Task if the task#
  // has already been rolled up.
  RPC.RPCCall record_task( RPC.RPCCall rpc ) {
    // Task removal (and roll-up) suffers from classic race-condition, which we
    // fix by a classic Dekker's algo; a task# is always in either the _work
    // HashMap, or rolled-up in the _removed_task_ids counter, or both (for
    // short intervals during the handoff).  We can never have a cycle where
    // it's in neither or else a late UDP may attempt to "resurrect" the
    // already completed task.  Hence we must always check the "removed ids"
    // AFTER we insert in the HashMap (we can check before also, but that's a
    // simple optimization and not sufficient for correctness).
    final RPC.RPCCall x = _work.putIfAbsent(rpc._tsknum,rpc);
    if( x != null ) return x;   // Return pre-existing work
    // If this RPC task# is very old, we just return a Golden Completed task.
    // The task is not just completed, but also we have already received
    // verification that the client got the answer.  So this is just a really
    // old attempt to restart a long-completed task.
    if( rpc._tsknum > _removed_task_ids.get() ) return null; // Task is new
    _work.remove(rpc._tsknum); // Bogus insert, need to remove it
    return _removed_task;      // And return a generic Golden Completed object
  }
  // Record the final return value for a DTask.  Should happen only once.
  // Recorded here, so if the client misses our ACK response we can resend the
  // same answer back.
  // What this does today: stamps the call with the current time and arms its
  // retry interval; the ACK timeout thread compares (_started + _retry)
  // against the clock to decide when to act on the call.
  void record_task_answer( RPC.RPCCall rpcall ) {
//    assert rpcall._started == 0 || rpcall._dt.hasException();
    rpcall._started = System.currentTimeMillis();
    rpcall._retry = RPC.RETRY_MS; // Start the timer on when to resend
//    AckAckTimeOutThread.PENDING.add(rpcall);
  }
  // Stop tracking a remote task, because we got an ACKACK.
  // Two steps: (1) null out the task's DTask exactly once, running its
  // onAckAck hook; (2) advance the roll-up counter over every leading task
  // whose DTask has already been nulled, dropping those from _work.
  void remove_task_tracking( int task ) {
    RPC.RPCCall rpc = _work.get(task);
    if( rpc == null ) return;   // Already stopped tracking

    // Atomically attempt to remove the 'dt'.  If we win, we are the sole
    // thread running the dt.onAckAck.  Also helps GC: the 'dt' is done (sent
    // to client and we received the ACKACK), but the rpc might need to stick
    // around a long time - and the dt might be big.
    DTask dt = rpc._dt;         // The existing DTask, if any
    if( dt != null && rpc.CAS_DT(dt,null) ) {
      assert rpc._computed : "Still not done #"+task+" "+dt.getClass()+" from "+rpc._client;
      dt.onAckAck();            // One-time call on stop-tracking
    }
    // Roll-up as many done RPCs as we can, into the _removed_task_ids list
    while( true ) {
      int t = _removed_task_ids.get();   // Last already-removed ID
      RPC.RPCCall rpc2 = _work.get(t+1); // RPC of 1st not-removed ID
      // Stop if the next task is unknown, still has its DTask, or another
      // thread advanced the counter first (that thread removes t+1 itself)
      if( rpc2 == null || rpc2._dt != null || !_removed_task_ids.compareAndSet(t,t+1) )
        break;                  // Stop when we hit in-progress tasks
      _work.remove(t+1);        // Else we can remove the tracking now
    }
  }

  // Resend ACK's, in case the UDP ACKACK got dropped.  Note that even if the
  // ACK was sent via TCP, the ACKACK might be dropped.  Further: even if we
  // *know* the client got our TCP response, we do not know *when* he'll
  // process it... so we cannot e.g. eagerly do an ACKACK on this side.  We
  // must wait for the real ACKACK - which can drop.  So we *must* resend ACK's
  // occasionally to force a resend of ACKACKs.

  // Background thread that sweeps, roughly once per second, the tracked work of
  // every other cloud member and acts on calls whose retry timer has expired:
  // drops tracking for departed/timed-out clients, re-sends ACKs for computed
  // and replied work, and sends a single NACK for work still in progress.
  static class AckAckTimeOutThread extends Thread {
    AckAckTimeOutThread() { super("ACKTimeout"); }
    // List of DTasks with results ready (and sent!), and awaiting an ACKACK.
    // Started by main() on a single thread, handle timing-out UDP packets
    @Override public void run() {
      Thread.currentThread().setPriority(Thread.MAX_PRIORITY-1);
      while( true ) {
        long currenTime = System.currentTimeMillis();
        for(H2ONode h2o:H2O.CLOUD._memary) {
          if(h2o != H2O.SELF) {
            for(RPCCall rpc:h2o._work.values()) {
              // Only look at calls whose retry deadline has passed
              if((rpc._started + rpc._retry) < currenTime) {
                // RPC from somebody who dropped out of cloud?
                if( (!H2O.CLOUD.contains(rpc._client) && !rpc._client._heartbeat._client) ||
                  // Timedout client?
                  (rpc._client._heartbeat._client && rpc._retry >= HeartBeatThread.CLIENT_TIMEOUT) ) {
                  rpc._client.remove_task_tracking(rpc._tsknum);
                } else  {
                  if (rpc._computed) {
                    if (rpc._computedAndReplied) {
                      DTask dt = rpc._dt;
                      // A null dt means the ACKACK already arrived; nothing to resend
                      if(dt != null) {
                        // Warn on every 5th resend of the same ACK
                        if (++rpc._ackResendCnt % 5 == 0)
                          Log.warn("Got " + rpc._ackResendCnt + " resends on ack for task # " + rpc._tsknum + ", class = " + dt.getClass().getSimpleName());
                        rpc.resend_ack();
                      }
                    }
                  } else if(rpc._nackResendCnt == 0) { // else send nack
                    // At most one NACK per call: the counter is bumped before sending
                    ++rpc._nackResendCnt;
                    rpc.send_nack();
                  }
                }
              }
            }
          }
        }
        // Pace the sweep: sleep off whatever is left of the one-second period
        long timeElapsed = System.currentTimeMillis()-currenTime;
        if(timeElapsed < 1000)
          try {Thread.sleep(1000-timeElapsed);} catch (InterruptedException e) {/*comment to stop ideaj warning*/}
      }
    }
  }

  // This Node rebooted recently; we can quit tracking prior work history.
  // Drops every tracked call and resets the roll-up counter to its initial 0.
  void rebooted() {
    _work.clear();
    _removed_task_ids.set(0);
  }

  // Custom Serialization Class: H2OKey need to be built.
  // Only the key goes over the wire.
  public final AutoBuffer write_impl(AutoBuffer ab) { return _key.write(ab); }
  // Reading does not populate 'this': it returns the interned node for the key
  // found in the buffer.
  public final H2ONode read_impl( AutoBuffer ab ) { return intern(H2Okey.read(ab)); }
  // JSON form is a single "node" string holding the key's toString()
  public final AutoBuffer writeJSON_impl(AutoBuffer ab) { return ab.putJSONStr("node",_key.toString()); }
  // Reading a node back from JSON is not supported; always throws H2O.fail()
  public final H2ONode readJSON_impl( AutoBuffer ab ) { throw H2O.fail(); }


  /** Returns the socket channel factory captured when this node was constructed. */
  public SocketChannelFactory getSocketFactory() {
    return _socketFactory;
  }

  /** Returns the security manager captured when this node was constructed. */
  public H2OSecurityManager getSecurityManager() {
    return _security;
  }
}