package org.sdnplatform.sync.internal.rpc; import java.net.InetSocketAddress; import java.net.SocketAddress; import java.util.EnumSet; import java.util.HashMap; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.ThreadFactory; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.locks.Condition; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReentrantLock; import net.floodlightcontroller.core.annotations.LogMessageCategory; import net.floodlightcontroller.core.annotations.LogMessageDoc; import net.floodlightcontroller.core.annotations.LogMessageDocs; import net.floodlightcontroller.core.util.SingletonTask; import net.floodlightcontroller.debugcounter.IDebugCounterService; import org.jboss.netty.bootstrap.ClientBootstrap; import org.jboss.netty.bootstrap.ServerBootstrap; import org.jboss.netty.channel.Channel; import org.jboss.netty.channel.ChannelFuture; import org.jboss.netty.channel.ChannelFutureListener; import org.jboss.netty.channel.ChannelPipelineFactory; import org.jboss.netty.channel.group.ChannelGroup; import org.jboss.netty.channel.group.DefaultChannelGroup; import org.jboss.netty.channel.socket.nio.NioClientSocketChannelFactory; import org.jboss.netty.channel.socket.nio.NioServerSocketChannelFactory; import org.jboss.netty.util.internal.LinkedTransferQueue; import org.sdnplatform.sync.internal.SyncManager; import org.sdnplatform.sync.internal.config.Node; import org.sdnplatform.sync.internal.util.Pair; import org.sdnplatform.sync.thrift.SyncMessage; import org.sdnplatform.sync.thrift.MessageType; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * A lightweight RPC mechanism built on netty. * @author readams */ @LogMessageCategory("State Synchronization") public class RPCService { protected static final Logger logger = LoggerFactory.getLogger(RPCService.class); /** * Sync manager associated with this RPC service */ protected SyncManager syncManager; /** * Debug counter service */ protected IDebugCounterService debugCounter; /** * Channel group that will hold all our channels */ final ChannelGroup cg = new DefaultChannelGroup("Internal RPC"); /** * {@link ExecutorService} used for netty boss threads */ protected ExecutorService bossExecutor; /** * {@link ExecutorService} used for netty worker threads */ protected ExecutorService workerExecutor; /** * Netty {@link ClientBootstrap} used for creating client connections */ protected ClientBootstrap clientBootstrap; /** * Netty {@link ServerBootstrap} used for creating server connections */ protected ServerBootstrap serverBootstrap; /** * {@link ChannelPipelineFactory} for creating connections */ protected RPCPipelineFactory pipelineFactory; /** * Node connections */ protected HashMap<Short, NodeConnection> connections = new HashMap<Short, NodeConnection>(); /** * Transaction ID used in message headers in the RPC protocol */ protected AtomicInteger transactionId = new AtomicInteger(); /** * Buffer size for sockets */ public static final int SEND_BUFFER_SIZE = 4 * 1024 * 1024; /** * Connect timeout for client connections */ public static final int CONNECT_TIMEOUT = 500; /** * True after the {@link RPCService#run()} method is called */ protected boolean started = false; /** * true after the {@link RPCService#shutdown()} method * is called. */ protected volatile boolean shutDown = false; /** * Task to periodically ensure that connections are active */ protected SingletonTask reconnectTask; /** * If we want to rate-limit certain types of messages, we can do * so by limiting the overall number of outstanding messages. * The number of such messages will be stored in the * {@link MessageWindow} */ protected ConcurrentHashMap<Short, MessageWindow> messageWindows; protected static final EnumSet<MessageType> windowedTypes = EnumSet.of(MessageType.SYNC_VALUE, MessageType.SYNC_OFFER); /** * A thread pool for handling sync messages. These messages require * a separate pool since writing to the node can be a blocking operation * while waiting for window capacity, and blocking the I/O threads could * lead to deadlock * @see SyncMessageWorker */ protected ExecutorService syncExecutor; /** * A queue for holding sync messages that are awaiting being written * to the channel. * @see SyncMessageWorker */ protected LinkedTransferQueue<NodeMessage> syncQueue = new LinkedTransferQueue<NodeMessage>(); /** * Number of workers in the sync message thread pool */ protected static final int SYNC_MESSAGE_POOL = 2; /** * The maximum number of outstanding pending messages for messages * that use message windows */ protected static final int MAX_PENDING_MESSAGES = 500; public RPCService(SyncManager syncManager, IDebugCounterService debugCounter) { super(); this.syncManager = syncManager; this.debugCounter = debugCounter; messageWindows = new ConcurrentHashMap<Short, MessageWindow>(); } // ************* // public methods // ************* /** * Start the RPC service */ public void run() { started = true; final ThreadGroup tg1 = new ThreadGroup("Sync Message Handlers"); tg1.setMaxPriority(Thread.NORM_PRIORITY - 3); ThreadFactory f1 = new ThreadFactory() { AtomicInteger id = new AtomicInteger(); @Override public Thread newThread(Runnable runnable) { return new Thread(tg1, runnable, "SyncMessage-" + id.getAndIncrement()); } }; syncExecutor = Executors.newCachedThreadPool(f1); for (int i = 0; i < SYNC_MESSAGE_POOL; i++) { syncExecutor.execute(new SyncMessageWorker()); } final ThreadGroup tg2 = new ThreadGroup("Sync I/O Threads"); tg2.setMaxPriority(Thread.NORM_PRIORITY - 1); ThreadFactory f2 = new ThreadFactory() { @Override public Thread newThread(Runnable runnable) { return new Thread(tg2, runnable); } }; bossExecutor = Executors.newCachedThreadPool(f2); workerExecutor = Executors.newCachedThreadPool(f2); pipelineFactory = new RPCPipelineFactory(syncManager, this); startServer(pipelineFactory); startClients(pipelineFactory); } /** * Stop the RPC service */ @LogMessageDocs({ @LogMessageDoc(level="WARN", message="Failed to cleanly shut down RPC server", explanation="Could not close all open sockets cleanly"), @LogMessageDoc(level="WARN", message="Interrupted while shutting down RPC server", explanation="Could not close all open sockets cleanly") }) public void shutdown() { shutDown = true; try { if (!cg.close().await(5, TimeUnit.SECONDS)) { logger.warn("Failed to cleanly shut down RPC server"); return; } if (clientBootstrap != null) clientBootstrap.releaseExternalResources(); clientBootstrap = null; if (serverBootstrap != null) serverBootstrap.releaseExternalResources(); serverBootstrap = null; if (pipelineFactory != null) pipelineFactory.releaseExternalResources(); pipelineFactory = null; if (bossExecutor != null) bossExecutor.shutdown(); bossExecutor = null; if (workerExecutor != null) workerExecutor.shutdown(); workerExecutor = null; } catch (InterruptedException e) { logger.warn("Interrupted while shutting down RPC server"); } logger.debug("Internal floodlight RPC shut down"); } /** * Get a suitable transaction ID for sending a message * @return the unique transaction iD */ public int getTransactionId() { return transactionId.getAndIncrement(); } /** * Write a message to the node specified * @param nodeId the node ID * @param bsm the message to write * @return <code>true</code> if the message was actually written to * the channel. Note this is not the same as having been sent to the * other node. * @throws InterruptedException */ public boolean writeToNode(Short nodeId, SyncMessage bsm) throws InterruptedException { if (nodeId == null) return false; NodeConnection nc = connections.get(nodeId); if (nc != null && nc.state == NodeConnectionState.CONNECTED) { waitForMessageWindow(bsm.getType(), nodeId, 0); nc.nodeChannel.write(bsm); return true; } return false; } /** * Remove the connection from the connection registry and clean up * any remaining shrapnel * @param nodeId */ public void disconnectNode(short nodeId) { synchronized (connections) { Short n = Short.valueOf(nodeId); MessageWindow mw = messageWindows.get(n); if (mw != null) { mw.lock.lock(); mw.disconnected = true; try { mw.full.signalAll(); messageWindows.remove(n); } finally { mw.lock.unlock(); } } NodeConnection nc = connections.get(nodeId); if (nc != null) { nc.nuke(); } connections.remove(nodeId); } } /** * Check whether all links are established * @return */ public boolean isFullyConnected() { for (Node n : syncManager.getClusterConfig().getNodes()) { if (n.getNodeId() != syncManager.getLocalNodeId() && !isConnected(n.getNodeId())) { if (logger.isTraceEnabled()) { logger.trace("[{}->{}] missing connection", syncManager.getLocalNodeId(), n.getNodeId()); } return false; } } return true; } /** * Find out if a particular node is connected * @param nodeId * @return true if the node is connected */ public boolean isConnected(short nodeId) { NodeConnection nc = connections.get(nodeId); return (nc != null && nc.state == NodeConnectionState.CONNECTED); } /** * Called when a message is acknowledged by a remote node * @param type the message type * @param nodeId the remote node */ public void messageAcked(MessageType type, Short nodeId) { if (nodeId == null) return; if (!windowedTypes.contains(type)) return; MessageWindow mw = messageWindows.get(nodeId); if (mw == null) return; int pending = mw.pending.decrementAndGet(); if (pending < MAX_PENDING_MESSAGES) { mw.lock.lock(); try { mw.full.signalAll(); } finally { mw.lock.unlock(); } } } // ************* // Local methods // ************* /** * Get the appropriate {@link MessageWindow} object for the given node. * @param nodeId the remote node * @return a {@link MessageWindow} object */ private MessageWindow getMW(short nodeId) { if (!isConnected(nodeId)) return null; Short n = Short.valueOf(nodeId); MessageWindow mw = messageWindows.get(n); if (mw == null) { mw = new MessageWindow(); MessageWindow old = messageWindows.putIfAbsent(n, mw); if (old != null) mw = old; } return mw; } /** * Wait for a message window slow to be available for the given node and * message type * @param type the type of the message * @param nodeId the node Id * @param maxWait the maximum time to wait in milliseconds * @throws InterruptedException * @return <code>true</code> if the message can be safely written */ private boolean waitForMessageWindow(MessageType type, short nodeId, long maxWait) throws InterruptedException { if (!windowedTypes.contains(type)) return true; long start = System.nanoTime(); // note that this can allow slightly more than the maximum number // of messages. This is fine. MessageWindow mw = getMW(nodeId); if (!mw.disconnected && mw.pending.get() >= MAX_PENDING_MESSAGES) { mw.lock.lock(); try { while (!mw.disconnected && mw.pending.get() >= MAX_PENDING_MESSAGES) { long now = System.nanoTime(); if (maxWait > 0 && (now - start) > maxWait * 1000) return false; mw.full.awaitNanos(now - start); } } finally { mw.lock.unlock(); } } mw = getMW(nodeId); if (mw != null) mw.pending.getAndIncrement(); return true; } /** * Start listening sockets */ @LogMessageDoc(level="INFO", message="Listening for internal floodlight RPC on {port}", explanation="The internal RPC service is ready for connections") protected void startServer(ChannelPipelineFactory pipelineFactory) { final ServerBootstrap bootstrap = new ServerBootstrap( new NioServerSocketChannelFactory(bossExecutor, workerExecutor)); bootstrap.setOption("reuseAddr", true); bootstrap.setOption("child.keepAlive", true); bootstrap.setOption("child.tcpNoDelay", true); bootstrap.setOption("child.sendBufferSize", SEND_BUFFER_SIZE); bootstrap.setOption("child.receiveBufferSize", SEND_BUFFER_SIZE); bootstrap.setPipelineFactory(pipelineFactory); serverBootstrap = bootstrap; int port = syncManager.getClusterConfig().getNode().getPort(); InetSocketAddress sa; String listenAddress = syncManager.getClusterConfig().getListenAddress(); if (listenAddress != null) sa = new InetSocketAddress(listenAddress, port); else sa = new InetSocketAddress(port); cg.add(bootstrap.bind(sa)); logger.info("Listening for internal floodlight RPC on {}", sa); } /** * Wait for the client connection * @author readams */ protected class ConnectCFListener implements ChannelFutureListener { protected Node node; public ConnectCFListener(Node node) { super(); this.node = node; } @Override public void operationComplete(ChannelFuture cf) throws Exception { if (!cf.isSuccess()) { synchronized (connections) { NodeConnection c = connections.remove(node.getNodeId()); if (c != null) c.nuke(); cf.getChannel().close(); } String message = "[unknown error]"; if (cf.isCancelled()) message = "Timed out on connect"; if (cf.getCause() != null) message = cf.getCause().getMessage(); logger.debug("[{}->{}] Could not connect to RPC " + "node: {}", new Object[]{syncManager.getLocalNodeId(), node.getNodeId(), message}); } else { logger.trace("[{}->{}] Channel future successful", syncManager.getLocalNodeId(), node.getNodeId()); } } } /** * Add the node connection to the node connection map * @param nodeId the node ID for the channel * @param channel the new channel */ protected void nodeConnected(short nodeId, Channel channel) { logger.debug("[{}->{}] Connection established", syncManager.getLocalNodeId(), nodeId); synchronized (connections) { NodeConnection c = connections.get(nodeId); if (c == null) { connections.put(nodeId, c = new NodeConnection()); } c.nodeChannel = channel; c.state = NodeConnectionState.CONNECTED; } } /** * Connect to remote servers. We'll initiate the connection to * any nodes with a lower ID so that there will be a single connection * between each pair of nodes which we'll use symmetrically */ protected void startClients(ChannelPipelineFactory pipelineFactory) { final ClientBootstrap bootstrap = new ClientBootstrap( new NioClientSocketChannelFactory(bossExecutor, workerExecutor)); bootstrap.setOption("child.reuseAddr", true); bootstrap.setOption("child.keepAlive", true); bootstrap.setOption("child.tcpNoDelay", true); bootstrap.setOption("child.sendBufferSize", SEND_BUFFER_SIZE); bootstrap.setOption("child.connectTimeoutMillis", CONNECT_TIMEOUT); bootstrap.setPipelineFactory(pipelineFactory); clientBootstrap = bootstrap; ScheduledExecutorService ses = syncManager.getThreadPool().getScheduledExecutor(); reconnectTask = new SingletonTask(ses, new ConnectTask()); reconnectTask.reschedule(0, TimeUnit.SECONDS); } /** * Connect to a remote node if appropriate * @param bootstrap the client bootstrap object * @param n the node to connect to */ protected void doNodeConnect(Node n) { if (!shutDown && n.getNodeId() < syncManager.getLocalNodeId()) { Short nodeId = n.getNodeId(); synchronized (connections) { NodeConnection c = connections.get(n.getNodeId()); if (c == null) { connections.put(nodeId, c = new NodeConnection()); } if (logger.isTraceEnabled()) { logger.trace("[{}->{}] Connection state: {}", new Object[]{syncManager.getLocalNodeId(), nodeId, c.state}); } if (c.state.equals(NodeConnectionState.NONE)) { if (logger.isDebugEnabled()) { logger.debug("[{}->{}] Attempting connection {} {}", new Object[]{syncManager.getLocalNodeId(), nodeId, n.getHostname(), n.getPort()}); } SocketAddress sa = new InetSocketAddress(n.getHostname(), n.getPort()); c.pendingFuture = clientBootstrap.connect(sa); c.pendingFuture.addListener(new ConnectCFListener(n)); c.state = NodeConnectionState.PENDING; } } } } /** * Ensure that all client connections are active */ protected void startClientConnections() { for (Node n : syncManager.getClusterConfig().getNodes()) { doNodeConnect(n); } } /** * Periodically ensure that all the node connections are alive * @author readams */ protected class ConnectTask implements Runnable { @Override public void run() { try { if (!shutDown) startClientConnections(); } catch (Exception e) { logger.error("Error in reconnect task", e); } if (!shutDown) { reconnectTask.reschedule(500, TimeUnit.MILLISECONDS); } } } /** * Various states for connections * @author readams */ protected enum NodeConnectionState { NONE, PENDING, CONNECTED } /** * Connection state wrapper for node connections * @author readams */ protected static class NodeConnection { volatile NodeConnectionState state = NodeConnectionState.NONE; protected ChannelFuture pendingFuture; protected Channel nodeChannel; protected void nuke() { state = NodeConnectionState.NONE; if (pendingFuture != null) pendingFuture.cancel(); if (nodeChannel != null) nodeChannel.close(); pendingFuture = null; nodeChannel = null; } } /** * Maintain state for the pending message window for a given message type * @author readams */ protected static class MessageWindow { AtomicInteger pending = new AtomicInteger(); volatile boolean disconnected = false; Lock lock = new ReentrantLock(); Condition full = lock.newCondition(); } /** * A pending message to be sent to a particular mode. * @author readams */ protected static class NodeMessage extends Pair<Short,SyncMessage> { private static final long serialVersionUID = -3443080461324647922L; public NodeMessage(Short first, SyncMessage second) { super(first, second); } } /** * A worker thread responsible for reading sync messages off the queue * and writing them to the appropriate node's channel. Because calls * {@link RPCService#writeToNode(Short, SyncMessage)} can block while * waiting for available slots in the message window, we do this in a * separate thread. * @author readams */ protected class SyncMessageWorker implements Runnable { @Override public void run() { while (true) { try { NodeMessage m = syncQueue.take(); writeToNode(m.getFirst(), m.getSecond()); } catch (Exception e) { logger.error("Error while dispatching message", e); } } } } }