/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.zookeeper.server.quorum;

import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.ServerSocket;
import java.net.Socket;
import java.net.SocketException;
import java.nio.BufferUnderflowException;
import java.nio.ByteBuffer;
import java.nio.channels.UnresolvedAddressException;
import java.util.Enumeration;
import java.util.NoSuchElementException;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * This class implements a connection manager for leader election using TCP. It
 * maintains one connection for every pair of servers. The tricky part is to
 * guarantee that there is exactly one connection for every pair of servers that
 * are operating correctly and that can communicate over the network.
 *
 * If two servers try to start a connection concurrently, then the connection
 * manager uses a very simple tie-breaking mechanism to decide which connection
 * to drop, based on the server ids of the two parties.
 *
 * For every peer, the manager maintains a queue of messages to send. If the
 * connection to any particular peer drops, then the sender thread puts the
 * message back on the list. As this implementation currently uses a queue
 * implementation to maintain messages to send to another peer, we add the
 * message to the tail of the queue, thus changing the order of messages.
 * Although this is not a problem for the leader election, it could be a problem
 * when consolidating peer communication. This is to be verified, though.
 */
public class QuorumCnxManager {
    private static final Logger LOG = LoggerFactory.getLogger(QuorumCnxManager.class);

    /*
     * Maximum capacity of thread queues
     */
    static final int RECV_CAPACITY = 100;
    // Initialized to 1 to prevent sending
    // stale notifications to peers
    static final int SEND_CAPACITY = 1;

    static final int PACKETMAXSIZE = 1024 * 1024;

    /*
     * Maximum number of attempts to connect to a peer
     */
    static final int MAX_CONNECTION_ATTEMPTS = 2;
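
    /*
     * Wire format note: the side that opens a connection first writes its own
     * server id as a single 8-byte long (see initiateConnection and
     * receiveConnection below). After that handshake, every message is framed
     * as a 4-byte length followed by the payload, and the length is checked
     * against PACKETMAXSIZE on the receiving side (see SendWorker.send and
     * RecvWorker.run).
     */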
    /*
     * Negative counter for observer server ids
     */
    private long observerCounter = -1;

    /*
     * Connection time out value in milliseconds
     */
    private int cnxTO = 5000;

    /*
     * Local IP address
     */
    final QuorumPeer self;

    /*
     * Mapping from Peer to Thread number
     */
    final ConcurrentHashMap<Long, SendWorker> senderWorkerMap;
    final ConcurrentHashMap<Long, ArrayBlockingQueue<ByteBuffer>> queueSendMap;
    final ConcurrentHashMap<Long, ByteBuffer> lastMessageSent;

    /*
     * Reception queue
     */
    public final ArrayBlockingQueue<Message> recvQueue;

    /*
     * Object to synchronize access to recvQueue
     */
    private final Object recvQLock = new Object();

    /*
     * Shutdown flag
     */
    boolean shutdown = false;

    /*
     * Listener thread
     */
    public final Listener listener;

    /*
     * Counter to count worker threads
     */
    private AtomicInteger threadCnt = new AtomicInteger(0);

    public static class Message {
        Message(ByteBuffer buffer, long sid) {
            this.buffer = buffer;
            this.sid = sid;
        }

        ByteBuffer buffer;
        long sid;
    }

    public QuorumCnxManager(QuorumPeer self) {
        this.recvQueue = new ArrayBlockingQueue<Message>(RECV_CAPACITY);
        this.queueSendMap = new ConcurrentHashMap<Long, ArrayBlockingQueue<ByteBuffer>>();
        this.senderWorkerMap = new ConcurrentHashMap<Long, SendWorker>();
        this.lastMessageSent = new ConcurrentHashMap<Long, ByteBuffer>();

        String cnxToValue = System.getProperty("zookeeper.cnxTimeout");
        if (cnxToValue != null) {
            this.cnxTO = Integer.parseInt(cnxToValue);
        }

        this.self = self;

        // Starts listener thread that waits for connection requests
        listener = new Listener();
    }

    /**
     * Invokes initiateConnection for testing purposes
     *
     * @param sid
     */
    public void testInitiateConnection(long sid) throws Exception {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Opening channel to server " + sid);
        }
        Socket sock = new Socket();
        setSockOpts(sock);
        sock.connect(self.getVotingView().get(sid).electionAddr, cnxTO);
        initiateConnection(sock, sid);
    }

    /**
     * If this server has initiated the connection, then it gives up on the
     * connection if it loses the challenge. Otherwise, it keeps the connection.
     */
    public boolean initiateConnection(Socket sock, Long sid) {
        DataOutputStream dout = null;
        try {
            // Sending id and challenge
            dout = new DataOutputStream(sock.getOutputStream());
            dout.writeLong(self.getId());
            dout.flush();
        } catch (IOException e) {
            LOG.warn("Ignoring exception reading or writing challenge: ", e);
            closeSocket(sock);
            return false;
        }

        // If lost the challenge, then drop the new connection
        if (sid > self.getId()) {
            LOG.info("Have smaller server identifier, so dropping the "
                    + "connection: (" + sid + ", " + self.getId() + ")");
            closeSocket(sock);
            // Otherwise proceed with the connection
        } else {
            SendWorker sw = new SendWorker(sock, sid);
            RecvWorker rw = new RecvWorker(sock, sid, sw);
            sw.setRecv(rw);

            SendWorker vsw = senderWorkerMap.get(sid);
            if (vsw != null) {
                vsw.finish();
            }

            senderWorkerMap.put(sid, sw);
            if (!queueSendMap.containsKey(sid)) {
                queueSendMap.put(sid,
                        new ArrayBlockingQueue<ByteBuffer>(SEND_CAPACITY));
            }

            sw.start();
            rw.start();

            return true;
        }
        return false;
    }
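
    /*
     * Tie-breaking note: initiateConnection() above keeps an outgoing
     * connection only when the local server id is larger than the remote one,
     * while receiveConnection() below keeps an incoming connection only when
     * the remote id is larger (ids of voting peers are distinct). When two
     * servers dial each other at the same time, exactly one of the two
     * sockets therefore survives.
     */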
    /**
     * If this server receives a connection request, then it gives up on the new
     * connection if it wins the challenge. Notice that it checks whether it has
     * a connection to this server already or not. If it does, then it sends the
     * smallest possible long value to lose the challenge.
     */
    public boolean receiveConnection(Socket sock) {
        Long sid = null;

        try {
            // Read server id
            DataInputStream din = new DataInputStream(sock.getInputStream());
            sid = din.readLong();
            if (sid == QuorumPeer.OBSERVER_ID) {
                /*
                 * Choose identifier at random. We need a value to identify
                 * the connection.
                 */
                sid = observerCounter--;
                LOG.info("Setting arbitrary identifier to observer: " + sid);
            }
        } catch (IOException e) {
            closeSocket(sock);
            LOG.warn("Exception reading or writing challenge: " + e.toString());
            return false;
        }

        // If wins the challenge, then close the new connection.
        if (sid < self.getId()) {
            /*
             * This replica might still believe that the connection to sid is
             * up, so we have to shut down the workers before trying to open a
             * new connection.
             */
            SendWorker sw = senderWorkerMap.get(sid);
            if (sw != null) {
                sw.finish();
            }

            /*
             * Now we start a new connection
             */
            LOG.debug("Create new connection to server: " + sid);
            closeSocket(sock);
            connectOne(sid);

            // Otherwise start worker threads to receive data.
        } else {
            SendWorker sw = new SendWorker(sock, sid);
            RecvWorker rw = new RecvWorker(sock, sid, sw);
            sw.setRecv(rw);

            SendWorker vsw = senderWorkerMap.get(sid);
            if (vsw != null) {
                vsw.finish();
            }

            senderWorkerMap.put(sid, sw);

            if (!queueSendMap.containsKey(sid)) {
                queueSendMap.put(sid,
                        new ArrayBlockingQueue<ByteBuffer>(SEND_CAPACITY));
            }

            sw.start();
            rw.start();

            return true;
        }
        return false;
    }

    /**
     * Processes invoke this method to queue a message to send. Currently,
     * only leader election uses it.
     */
    public void toSend(Long sid, ByteBuffer b) {
        /*
         * If sending message to myself, then simply enqueue it (loopback).
         */
        if (self.getId() == sid) {
            b.position(0);
            addToRecvQueue(new Message(b.duplicate(), sid));
            /*
             * Otherwise send to the corresponding thread to send.
             */
        } else {
            /*
             * Start a new connection if doesn't have one already.
             */
            if (!queueSendMap.containsKey(sid)) {
                ArrayBlockingQueue<ByteBuffer> bq =
                        new ArrayBlockingQueue<ByteBuffer>(SEND_CAPACITY);
                queueSendMap.put(sid, bq);
                addToSendQueue(bq, b);
            } else {
                ArrayBlockingQueue<ByteBuffer> bq = queueSendMap.get(sid);
                if (bq != null) {
                    addToSendQueue(bq, b);
                } else {
                    LOG.error("No queue for server " + sid);
                }
            }
            connectOne(sid);
        }
    }

    /**
     * Try to establish a connection to server with id sid.
     *
     * @param sid  server id
     */
    synchronized void connectOne(long sid) {
        if (senderWorkerMap.get(sid) == null) {
            InetSocketAddress electionAddr;
            if (self.quorumPeers.containsKey(sid)) {
                electionAddr = self.quorumPeers.get(sid).electionAddr;
            } else {
                LOG.warn("Invalid server id: " + sid);
                return;
            }
            try {
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Opening channel to server " + sid);
                }
                Socket sock = new Socket();
                setSockOpts(sock);
                sock.connect(self.getView().get(sid).electionAddr, cnxTO);
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Connected to server " + sid);
                }
                initiateConnection(sock, sid);
            } catch (UnresolvedAddressException e) {
                // Sun doesn't include the address that causes this
                // exception to be thrown, also UAE cannot be wrapped cleanly
                // so we log the exception in order to capture this critical
                // detail.
                LOG.warn("Cannot open channel to " + sid
                        + " at election address " + electionAddr, e);
                throw e;
            } catch (IOException e) {
                LOG.warn("Cannot open channel to " + sid
                        + " at election address " + electionAddr, e);
            }
        } else {
            LOG.debug("There is a connection already for server " + sid);
        }
    }

    /**
     * Try to establish a connection with each server if one
     * doesn't exist.
     */
    public void connectAll() {
        long sid;
        for (Enumeration<Long> en = queueSendMap.keys(); en.hasMoreElements();) {
            sid = en.nextElement();
            connectOne(sid);
        }
    }
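
    /*
     * Note on the check below: haveDelivered() returns true as soon as it
     * finds a single empty send queue. Since SEND_CAPACITY is 1, an empty
     * queue typically means the last message queued for that peer has already
     * been picked up by its SendWorker.
     */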
    /**
     * Check if all queues are empty, indicating that all messages have been delivered.
     */
    boolean haveDelivered() {
        for (ArrayBlockingQueue<ByteBuffer> queue : queueSendMap.values()) {
            LOG.debug("Queue size: " + queue.size());
            if (queue.size() == 0) {
                return true;
            }
        }
        return false;
    }

    /**
     * Flag that it is time to wrap up all activities and interrupt the listener.
     */
    public void halt() {
        shutdown = true;
        LOG.debug("Halting listener");
        listener.halt();

        softHalt();
    }

    /**
     * A soft halt simply finishes workers.
     */
    public void softHalt() {
        for (SendWorker sw : senderWorkerMap.values()) {
            LOG.debug("Halting sender: " + sw);
            sw.finish();
        }
    }

    /**
     * Helper method to set socket options.
     *
     * @param sock
     *            Reference to socket
     */
    private void setSockOpts(Socket sock) throws SocketException {
        sock.setTcpNoDelay(true);
        sock.setSoTimeout(self.tickTime * self.syncLimit);
    }

    /**
     * Helper method to close a socket.
     *
     * @param sock
     *            Reference to socket
     */
    private void closeSocket(Socket sock) {
        try {
            sock.close();
        } catch (IOException ie) {
            LOG.error("Exception while closing", ie);
        }
    }

    /**
     * Return number of worker threads
     */
    public long getThreadCount() {
        return threadCnt.get();
    }

    /**
     * Return reference to QuorumPeer
     */
    public QuorumPeer getQuorumPeer() {
        return self;
    }

    /**
     * Thread to listen on some port
     */
    public class Listener extends Thread {

        volatile ServerSocket ss = null;

        /**
         * Sleeps on accept().
         */
        @Override
        public void run() {
            int numRetries = 0;
            while ((!shutdown) && (numRetries < 3)) {
                try {
                    ss = new ServerSocket();
                    ss.setReuseAddress(true);
                    int port = self.quorumPeers.get(self.getId()).electionAddr.getPort();
                    InetSocketAddress addr = new InetSocketAddress(port);
                    LOG.info("My election bind port: " + addr.toString());
                    setName(self.quorumPeers.get(self.getId()).electionAddr.toString());
                    ss.bind(addr);
                    while (!shutdown) {
                        Socket client = ss.accept();
                        setSockOpts(client);
                        LOG.info("Received connection request "
                                + client.getRemoteSocketAddress());
                        receiveConnection(client);
                        numRetries = 0;
                    }
                } catch (IOException e) {
                    LOG.error("Exception while listening", e);
                    numRetries++;
                    try {
                        ss.close();
                        Thread.sleep(1000);
                    } catch (IOException ie) {
                        LOG.error("Error closing server socket", ie);
                    } catch (InterruptedException ie) {
                        LOG.error("Interrupted while sleeping. "
                                + "Ignoring exception", ie);
                    }
                }
            }
            LOG.info("Leaving listener");
            if (!shutdown) {
                LOG.error("As I'm leaving the listener thread, "
                        + "I won't be able to participate in leader "
                        + "election any longer: "
                        + self.quorumPeers.get(self.getId()).electionAddr);
            }
        }

        /**
         * Halts this listener thread.
         */
        void halt() {
            try {
                LOG.debug("Trying to close listener: " + ss);
                if (ss != null) {
                    LOG.debug("Closing listener: " + self.getId());
                    ss.close();
                }
            } catch (IOException e) {
                LOG.warn("Exception when shutting down listener: " + e);
            }
        }
    }
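
    /*
     * Listener retry note: the listener above gives up after three
     * consecutive bind/accept failures (numRetries), and the counter is reset
     * after every successfully accepted connection. Once it gives up, this
     * peer can no longer take part in leader election, as logged on exit.
     */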
    /**
     * Thread to send messages. Instance waits on a queue, and sends a message
     * as soon as there is one available. If the connection breaks, then opens
     * a new one.
     */
    class SendWorker extends Thread {
        Long sid;
        Socket sock;
        RecvWorker recvWorker;
        volatile boolean running = true;
        DataOutputStream dout;

        /**
         * An instance of this thread receives messages to send
         * through a queue and sends them to the server sid.
         *
         * @param sock
         *            Socket to remote peer
         * @param sid
         *            Server identifier of remote peer
         */
        SendWorker(Socket sock, Long sid) {
            super("SendWorker:" + sid);
            this.sid = sid;
            this.sock = sock;
            recvWorker = null;
            try {
                dout = new DataOutputStream(sock.getOutputStream());
            } catch (IOException e) {
                LOG.error("Unable to access socket output stream", e);
                closeSocket(sock);
                running = false;
            }
            LOG.debug("Address of remote peer: " + this.sid);
        }

        synchronized void setRecv(RecvWorker recvWorker) {
            this.recvWorker = recvWorker;
        }

        /**
         * Returns RecvWorker that pairs up with this SendWorker.
         *
         * @return RecvWorker
         */
        synchronized RecvWorker getRecvWorker() {
            return recvWorker;
        }

        synchronized boolean finish() {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Calling finish for " + sid);
            }

            if (!running) {
                /*
                 * Avoids running finish() twice.
                 */
                return running;
            }

            running = false;
            closeSocket(sock);
            // channel = null;

            this.interrupt();
            if (recvWorker != null) {
                recvWorker.finish();
            }

            if (LOG.isDebugEnabled()) {
                LOG.debug("Removing entry from senderWorkerMap sid=" + sid);
            }
            senderWorkerMap.remove(sid, this);
            threadCnt.decrementAndGet();
            return running;
        }

        synchronized void send(ByteBuffer b) throws IOException {
            byte[] msgBytes = new byte[b.capacity()];
            try {
                b.position(0);
                b.get(msgBytes);
            } catch (BufferUnderflowException be) {
                LOG.error("BufferUnderflowException ", be);
                return;
            }
            dout.writeInt(b.capacity());
            dout.write(b.array());
            dout.flush();
        }

        @Override
        public void run() {
            threadCnt.incrementAndGet();
            try {
                /*
                 * If there is nothing in the queue to send, then we
                 * send the lastMessage to ensure that the last message
                 * was received by the peer. The message could be dropped
                 * in case self or the peer shut down their connection
                 * (and exit the thread) prior to reading/processing
                 * the last message. Duplicate messages are handled correctly
                 * by the peer.
                 *
                 * If the send queue is non-empty, then we have a more recent
                 * message than the one stored in lastMessage. To avoid sending
                 * a stale message, we should send the message in the send queue.
                 */
                ArrayBlockingQueue<ByteBuffer> bq = queueSendMap.get(sid);
                if (bq == null || isSendQueueEmpty(bq)) {
                    ByteBuffer b = lastMessageSent.get(sid);
                    if (b != null) {
                        LOG.debug("Attempting to send lastMessage to sid=" + sid);
                        send(b);
                    }
                }
            } catch (IOException e) {
                LOG.error("Failed to send last message. Shutting down thread.", e);
                this.finish();
            }

            try {
                while (running && !shutdown && sock != null) {
                    ByteBuffer b = null;
                    try {
                        ArrayBlockingQueue<ByteBuffer> bq = queueSendMap.get(sid);
                        if (bq != null) {
                            b = pollSendQueue(bq, 1000, TimeUnit.MILLISECONDS);
                        } else {
                            LOG.error("No queue of incoming messages for "
                                    + "server " + sid);
                            break;
                        }

                        if (b != null) {
                            lastMessageSent.put(sid, b);
                            send(b);
                        }
                    } catch (InterruptedException e) {
                        LOG.warn("Interrupted while waiting for message on queue", e);
                    }
                }
            } catch (Exception e) {
                LOG.warn("Exception when using channel: for id " + sid + " my id = "
                        + self.getId() + " error = " + e);
            }
            this.finish();
            LOG.warn("Send worker leaving thread");
        }
    }
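
    /*
     * Teardown note: SendWorker.finish() above closes the shared socket and
     * also finishes the paired RecvWorker. Conversely, when the RecvWorker
     * below leaves its read loop (broken connection or invalid packet), its
     * finally block finishes the SendWorker and closes the socket, so the
     * pair always goes down together.
     */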
    /**
     * Thread to receive messages. Instance waits on a socket read. If the
     * channel breaks, then removes itself from the pool of receivers.
     */
    class RecvWorker extends Thread {
        Long sid;
        Socket sock;
        volatile boolean running = true;
        DataInputStream din;
        final SendWorker sw;

        RecvWorker(Socket sock, Long sid, SendWorker sw) {
            super("RecvWorker:" + sid);
            this.sid = sid;
            this.sock = sock;
            this.sw = sw;
            try {
                din = new DataInputStream(sock.getInputStream());
                // OK to wait until socket disconnects while reading.
                sock.setSoTimeout(0);
            } catch (IOException e) {
                LOG.error("Error while accessing socket for " + sid, e);
                closeSocket(sock);
                running = false;
            }
        }

        /**
         * Shuts down this worker
         *
         * @return boolean  Value of variable running
         */
        synchronized boolean finish() {
            if (!running) {
                /*
                 * Avoids running finish() twice.
                 */
                return running;
            }
            running = false;

            this.interrupt();
            threadCnt.decrementAndGet();
            return running;
        }

        @Override
        public void run() {
            threadCnt.incrementAndGet();
            try {
                while (running && !shutdown && sock != null) {
                    /*
                     * Reads the first int to determine the length of the
                     * message.
                     */
                    int length = din.readInt();
                    if (length <= 0 || length > PACKETMAXSIZE) {
                        throw new IOException(
                                "Received packet with invalid length: " + length);
                    }
                    /*
                     * Allocates a new ByteBuffer to receive the message.
                     */
                    byte[] msgArray = new byte[length];
                    din.readFully(msgArray, 0, length);
                    ByteBuffer message = ByteBuffer.wrap(msgArray);
                    addToRecvQueue(new Message(message.duplicate(), sid));
                }
            } catch (Exception e) {
                LOG.warn("Connection broken for id " + sid + ", my id = "
                        + self.getId() + ", error = ", e);
            } finally {
                LOG.warn("Interrupting SendWorker");
                sw.finish();
                if (sock != null) {
                    closeSocket(sock);
                }
            }
        }
    }

    /**
     * Inserts an element in the specified queue. If the Queue is full, this
     * method removes an element from the head of the Queue and then inserts
     * the element at the tail. It can happen that an element is removed
     * by the {@link SendWorker} thread (via {@link #pollSendQueue}) before
     * this method attempts to remove an element from the queue.
     * This will cause {@link ArrayBlockingQueue#remove() remove} to throw an
     * exception, which is safe to ignore.
     *
     * Unlike {@link #addToRecvQueue(Message) addToRecvQueue} this method does
     * not need to be synchronized since there is only one thread that inserts
     * an element in the queue and another thread that reads from the queue.
     *
     * @param queue
     *            Reference to the Queue
     * @param buffer
     *            Reference to the buffer to be inserted in the queue
     */
    private void addToSendQueue(ArrayBlockingQueue<ByteBuffer> queue,
            ByteBuffer buffer) {
        if (queue.remainingCapacity() == 0) {
            try {
                queue.remove();
            } catch (NoSuchElementException ne) {
                // element could be removed by poll()
                LOG.debug("Trying to remove from an empty "
                        + "Queue. Ignoring exception " + ne);
            }
        }
        try {
            queue.add(buffer);
        } catch (IllegalStateException ie) {
            // This should never happen
            LOG.error("Unable to insert an element in the queue " + ie);
        }
    }

    /**
     * Returns true if queue is empty.
     *
     * @param queue
     *            Reference to the queue
     * @return
     *            true if the specified queue is empty
     */
    private boolean isSendQueueEmpty(ArrayBlockingQueue<ByteBuffer> queue) {
        return queue.isEmpty();
    }

    /**
     * Retrieves and removes the buffer at the head of this queue,
     * waiting up to the specified wait time if necessary for an element to
     * become available.
     *
     * {@link ArrayBlockingQueue#poll(long, java.util.concurrent.TimeUnit)}
     */
    private ByteBuffer pollSendQueue(ArrayBlockingQueue<ByteBuffer> queue,
            long timeout, TimeUnit unit) throws InterruptedException {
        return queue.poll(timeout, unit);
    }
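
    /*
     * Overflow policy note: both addToSendQueue() above and addToRecvQueue()
     * below drop the oldest element when their queue is full, so the freshest
     * notifications are always kept. With SEND_CAPACITY equal to 1, the send
     * queue holds at most the single most recent message per peer.
     */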
    /**
     * Inserts an element in the {@link #recvQueue}. If the Queue is full, this
     * method removes an element from the head of the Queue and then inserts
     * the element at the tail of the queue.
     *
     * This method is synchronized to achieve fairness between two threads that
     * are trying to insert an element in the queue. Each thread checks if the
     * queue is full, then removes the element at the head of the queue, and
     * then inserts an element at the tail. This three-step process is done to
     * prevent a thread from blocking while inserting an element in the queue.
     * If we do not synchronize the call to this method, then a thread can grab
     * a slot in the queue created by the second thread. This can cause the call
     * to insert by the second thread to fail.
     * Note that synchronizing this method does not block another thread
     * from polling the queue since that synchronization is provided by the
     * queue itself.
     *
     * @param msg
     *            Reference to the message to be inserted in the queue
     */
    public void addToRecvQueue(Message msg) {
        synchronized (recvQLock) {
            if (recvQueue.remainingCapacity() == 0) {
                try {
                    recvQueue.remove();
                } catch (NoSuchElementException ne) {
                    // element could be removed by poll()
                    LOG.debug("Trying to remove from an empty "
                            + "recvQueue. Ignoring exception " + ne);
                }
            }
            try {
                recvQueue.add(msg);
            } catch (IllegalStateException ie) {
                // This should never happen
                LOG.error("Unable to insert element in the recvQueue " + ie);
            }
        }
    }

    /**
     * Retrieves and removes a message at the head of this queue,
     * waiting up to the specified wait time if necessary for an element to
     * become available.
     *
     * {@link ArrayBlockingQueue#poll(long, java.util.concurrent.TimeUnit)}
     */
    public Message pollRecvQueue(long timeout, TimeUnit unit)
            throws InterruptedException {
        return recvQueue.poll(timeout, unit);
    }
}