/* This file is part of VoltDB.
* Copyright (C) 2008-2017 VoltDB Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with VoltDB. If not, see <http://www.gnu.org/licenses/>.
*/
package org.voltcore.messaging;
import java.io.EOFException;
import java.io.IOException;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.SocketAddress;
import java.nio.ByteBuffer;
import java.nio.channels.ClosedByInterruptException;
import java.nio.channels.ClosedSelectorException;
import java.nio.channels.SelectionKey;
import java.nio.channels.Selector;
import java.nio.channels.ServerSocketChannel;
import java.nio.channels.SocketChannel;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Random;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.Callable;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import org.json_voltpatches.JSONArray;
import org.json_voltpatches.JSONException;
import org.json_voltpatches.JSONObject;
import org.voltcore.logging.Level;
import org.voltcore.logging.VoltLogger;
import org.voltcore.network.ReverseDNSCache;
import org.voltcore.utils.CoreUtils;
import org.voltcore.utils.VersionChecker;
import com.google_voltpatches.common.collect.ImmutableMap;
import com.google_voltpatches.common.net.HostAndPort;
/**
* SocketJoiner runs all the time listening for new nodes in the cluster. Since it is a dedicated thread
* it is able to block while a new node joins without disrupting other activities.
*
* At startup socket joiner will connect to the rest of the cluster in the start method if it fails
* to bind to the leader address.
*
* If it binds to the leader address and becomes the leader the start method returns immediately and runPrimary
* is run from a separate thread. runPrimary will wait for the countdown latch for bootstrapping zk to count down
* before accepting new connections
*/
public class SocketJoiner {
static final String HOSTS = "hosts";
static final String REPORTED_ADDRESS = "reportedAddress";
static final String NEW_HOST_ID = "newHostId";
static final String REASON = "reason";
static final String MAY_RETRY = "mayRetry";
static final String ACCEPTED = "accepted";
private static final String MAY_EXCHANGE_TS = "mayExchangeTs";
private static final String TYPE = "type";
static final String HOST_ID = "hostId";
static final String PORT = "port";
static final String ADDRESS = "address";
private static final String VERSION_COMPATIBLE = "versionCompatible";
private static final String BUILD_STRING = "buildString";
public static final String VERSION_STRING = "versionString";
private static final int MAX_CLOCKSKEW = Integer.getInteger("MAX_CLOCKSKEW", 200);
private static final int RETRY_INTERVAL = Integer.getInteger("MESH_JOIN_RETRY_INTERVAL", 10);
private static final int RETRY_INTERVAL_SALT = Integer.getInteger("MESH_JOIN_RETRY_INTERVAL_SALT", 30);
private static final int CRITICAL_CLOCKSKEW = 100;
/**
* Supports quick probes for request host id attempts to seed nodes
*/
enum ConnectStrategy {
CONNECT, PROBE
}
enum ConnectionType {
REQUEST_HOSTID,
PUBLISH_HOSTID,
REQUEST_CONNECTION;
}
/**
* Interface into host messenger to notify it of new connections.
*
*/
public interface JoinHandler {
/*
* Notify that a specific host has joined with the specified host id.
*/
public void notifyOfJoin(
int hostId,
SocketChannel socket,
InetSocketAddress listeningAddress,
JSONObject jo);
/*
* A node wants to join the socket mesh
*/
public void requestJoin(
SocketChannel socket,
InetSocketAddress listeningAddress,
JSONObject jo) throws Exception;
/*
* A connection has been made to all of the specified hosts. Invoked by
* nodes connected to the cluster
*/
public void notifyOfHosts(
int yourLocalHostId,
int hosts[],
SocketChannel sockets[],
InetSocketAddress listeningAddresses[],
Map<Integer, JSONObject> jos) throws Exception;
/*
* Create new connection between given node and current node
*/
public void notifyOfConnection(
int hostId,
SocketChannel socket,
InetSocketAddress listeningAddress) throws Exception;
}
private static class RequestHostIdResponse {
final private JSONObject m_leaderInfo;
final private JSONObject m_responseBody;
public RequestHostIdResponse(JSONObject leaderInfo, JSONObject responseBody) {
m_leaderInfo = leaderInfo;
m_responseBody = responseBody;
}
JSONObject getLeaderInfo() { return m_leaderInfo; }
JSONObject getResponseBody() { return m_responseBody; }
}
private static final VoltLogger LOG = new VoltLogger("JOINER");
private static final VoltLogger consoleLog = new VoltLogger("CONSOLE");
private static final VoltLogger hostLog = new VoltLogger("HOST");
private final ExecutorService m_es = CoreUtils.getSingleThreadExecutor("Socket Joiner");
InetSocketAddress m_coordIp = null;
int m_localHostId = 0;
private final List<ServerSocketChannel> m_listenerSockets = new ArrayList<ServerSocketChannel>();
private Selector m_selector;
private final JoinHandler m_joinHandler;
// from configuration data
int m_internalPort = 3021;
String m_internalInterface = "";
/*
* The interface we connected to the leader on
*/
String m_reportedInternalInterface;
public boolean start(final CountDownLatch externalInitBarrier) {
boolean retval = false;
/*
* probe coordinator host list for leader candidates that may are operational
* (i.e. node state is operational)
*/
m_coordIp = null;
for (String coordHost: m_acceptor.getCoordinators()) {
if (m_coordIp != null) {
break;
}
HostAndPort host = HostAndPort.fromString(coordHost)
.withDefaultPort(org.voltcore.common.Constants.DEFAULT_INTERNAL_PORT);
InetSocketAddress ip = !host.getHostText().isEmpty() ?
new InetSocketAddress(host.getHostText(), host.getPort())
: new InetSocketAddress(host.getPort());
/*
* On an operational leader (i.e. node is up) the request to join the cluster
* may be rejected, e.g. multiple hosts rejoining at the same time. In this case,
* the code will retry.
*/
long retryInterval = RETRY_INTERVAL;
final Random salt = new Random();
while (true) {
try {
connectToPrimary(ip, ConnectStrategy.PROBE);
break;
} catch (CoreUtils.RetryException e) {
LOG.warn(String.format("Request to join cluster mesh is rejected, retrying in %d seconds. %s",
retryInterval, e.getMessage()));
try {
Thread.sleep(TimeUnit.SECONDS.toMillis(retryInterval));
} catch (InterruptedException ignoreIt) {
}
// exponential back off with a salt to avoid collision. Max is 5 minutes.
retryInterval = (Math.min(retryInterval * 2, TimeUnit.MINUTES.toSeconds(5)) +
salt.nextInt(RETRY_INTERVAL_SALT));
//Over waiting may occur in some cases.
//For example, there are 4 rejoining nodes. Node 1 may take over 5 min to be completed.
//Nodes 2 to 4 continue to wait after they detect that node 1 is still rejoining right before its rejoining is completed
//They will wait 5 min + salt before sending another rejoining request. All the following rejoining requests are sent
//after 5 min + salt. Reset waiting time to avoid over waiting.
if (retryInterval > TimeUnit.MINUTES.toSeconds(5)) {
retryInterval = RETRY_INTERVAL;
}
} catch (Exception e) {
hostLog.error("Failed to establish socket mesh.", e);
throw new RuntimeException("Failed to establish socket mesh with " + m_coordIp, e);
}
}
}
boolean haveMeshedLeader = m_coordIp != null;
/*
* if none were found pick the first one in lexicographical order
*/
if (m_coordIp == null) {
HostAndPort leader = m_acceptor.getLeader();
m_coordIp = !leader.getHostText().isEmpty() ?
new InetSocketAddress(leader.getHostText(), leader.getPort())
: new InetSocketAddress(leader.getPort());
}
if (!haveMeshedLeader && m_coordIp.getPort() == m_internalPort) {
try {
hostLog.info("Attempting to bind to leader ip " + m_coordIp);
ServerSocketChannel listenerSocket = ServerSocketChannel.open();
listenerSocket.socket().bind(m_coordIp);
listenerSocket.socket().setPerformancePreferences(0, 2, 1);
listenerSocket.configureBlocking(false);
m_listenerSockets.add(listenerSocket);
}
catch (IOException e) {
if (!m_listenerSockets.isEmpty()) {
try {
m_listenerSockets.get(0).close();
m_listenerSockets.clear();
}
catch (IOException ex) {
new VoltLogger(SocketJoiner.class.getName()).l7dlog(Level.FATAL, null, ex);
}
}
}
}
if (!m_listenerSockets.isEmpty()) {
// if an internal interface was specified, see if it matches any
// of the forms of the leader address we've bound to.
if (m_internalInterface != null && !m_internalInterface.equals("")) {
if (!m_internalInterface.equals(ReverseDNSCache.hostnameOrAddress(m_coordIp.getAddress())) &&
!m_internalInterface.equals(m_coordIp.getAddress().getCanonicalHostName()) &&
!m_internalInterface.equals(m_coordIp.getAddress().getHostAddress()))
{
org.voltdb.VoltDB.crashLocalVoltDB(
String.format("The provided internal interface (%s) does not match the "
+ "specified leader address (%s, %s). "
+ "This will result in either a cluster which fails to start or an unintended network topology. "
+ "The leader will now exit; correct your specified leader and interface and try restarting.",
m_internalInterface,
ReverseDNSCache.hostnameOrAddress(m_coordIp.getAddress()),
m_coordIp.getAddress().getHostAddress()),
false, null);
}
}
retval = true;
consoleLog.info("Connecting to VoltDB cluster as the leader...");
/*
* Need to wait for external initialization to complete before
* accepting new connections. This is slang for the leader
* creating an agreement site that agrees with itself
*/
m_es.submit(new Callable<Object>() {
@Override
public Object call() throws Exception {
externalInitBarrier.await();
return null;
}
});
}
else if (!haveMeshedLeader) {
consoleLog.info("Connecting to the VoltDB cluster leader " + m_coordIp);
try {
connectToPrimary(m_coordIp, ConnectStrategy.CONNECT);
} catch (Exception e) {
hostLog.error("Failed to establish socket mesh.", e);
throw new RuntimeException("Failed to establish socket mesh with " + m_coordIp, e);
}
}
/*
* Submit a task to start the main run loop,
* will wait for agreement to be initialized if this
* is the leader using the previously queued runnable
*/
m_es.submit(new Runnable() {
@Override
public void run() {
try {
runPrimary();
} catch (InterruptedException e) {
} catch (Throwable e) {
org.voltdb.VoltDB.crashLocalVoltDB("Error in socket joiner run loop", true, e);
}
}
});
return retval;
}
/** Set to true when the thread exits correctly. */
private final boolean success = false;
private final AtomicBoolean m_paused;
private final JoinAcceptor m_acceptor;
public boolean getSuccess() {
return success;
}
public SocketJoiner(
String internalInterface,
int internalPort,
AtomicBoolean isPaused,
JoinAcceptor acceptor,
JoinHandler jh) {
if (internalInterface == null || jh == null || acceptor == null) {
throw new IllegalArgumentException();
}
m_joinHandler = jh;
m_internalInterface = internalInterface;
m_internalPort = internalPort;
m_paused = isPaused;
m_acceptor = acceptor;
}
/*
* Bind to the internal interface if one was specified,
* otherwise bind on all interfaces. The leader won't invoke this.
*/
private void doBind() throws Exception {
LOG.debug("Creating listener socket");
try {
m_selector = Selector.open();
} catch (IOException e) {
throw new RuntimeException(e);
}
ServerSocketChannel listenerSocket = ServerSocketChannel.open();
InetSocketAddress inetsockaddr;
if ((m_internalInterface == null) || (m_internalInterface.length() == 0)) {
inetsockaddr = new InetSocketAddress(m_internalPort);
}
else {
inetsockaddr = new InetSocketAddress(m_internalInterface, m_internalPort);
}
try {
hostLog.info("Attempting to bind to internal ip " + inetsockaddr);
listenerSocket.socket().bind(inetsockaddr);
listenerSocket.configureBlocking(false);
m_listenerSockets.add(listenerSocket);
} catch (Exception e) {
/*
* If we bound to the leader address, the internal interface address might not
* bind if it is all interfaces
*/
if (m_listenerSockets.isEmpty()) {
LOG.fatal("Failed to bind to " + inetsockaddr);
CoreUtils.printPortsInUse(hostLog);
throw e;
}
}
for (ServerSocketChannel ssc : m_listenerSockets) {
ssc.register(m_selector, SelectionKey.OP_ACCEPT);
}
if (LOG.isDebugEnabled()) {
LOG.debug("Non-Primary Listening on:" + inetsockaddr.toString());
}
}
/**
* Read a length prefixed JSON message
*/
private JSONObject readJSONObjFromWire(SocketChannel sc, String remoteAddressForErrorMsg) throws IOException, JSONException {
// length prefix
ByteBuffer lengthBuffer = ByteBuffer.allocate(4);
while (lengthBuffer.remaining() > 0) {
int read = sc.read(lengthBuffer);
if (read == -1) {
throw new EOFException(remoteAddressForErrorMsg);
}
}
lengthBuffer.flip();
int length = lengthBuffer.getInt();
// don't allow for a crazy unallocatable json payload
if (length > 16 * 1024) {
throw new IOException(
"Length prefix on wire for expected JSON string is greater than 16K max.");
}
if (length < 2) {
throw new IOException(
"Length prefix on wire for expected JSON string is less than minimum document size of 2.");
}
// content
ByteBuffer messageBytes = ByteBuffer.allocate(length);
while (messageBytes.hasRemaining()) {
int read = sc.read(messageBytes);
if (read == -1) {
throw new EOFException(remoteAddressForErrorMsg);
}
}
messageBytes.flip();
JSONObject jsObj = new JSONObject(new String(messageBytes.array(), StandardCharsets.UTF_8));
return jsObj;
}
/*
* Pull all ready to accept sockets
*/
private void processSSC(ServerSocketChannel ssc) throws Exception {
SocketChannel sc = null;
while ((sc = ssc.accept()) != null) {
try {
sc.socket().setTcpNoDelay(true);
sc.socket().setPerformancePreferences(0, 2, 1);
final String remoteAddress = sc.socket().getRemoteSocketAddress().toString();
/*
* Send the current time over the new connection for a clock skew check
*/
ByteBuffer currentTimeBuf = ByteBuffer.allocate(8);
currentTimeBuf.putLong(System.currentTimeMillis());
currentTimeBuf.flip();
while (currentTimeBuf.hasRemaining()) {
sc.write(currentTimeBuf);
}
/*
* Read a length prefixed JSON message
*/
JSONObject jsObj = readJSONObjFromWire(sc, remoteAddress);
LOG.info(jsObj.toString(2));
// get the connecting node's version string
String remoteBuildString = jsObj.getString(VERSION_STRING);
VersionChecker versionChecker = m_acceptor.getVersionChecker();
// send a response with version/build data of this node
JSONObject returnJs = new JSONObject();
returnJs.put(VERSION_STRING, versionChecker.getVersionString());
returnJs.put(BUILD_STRING, versionChecker.getBuildString());
returnJs.put(VERSION_COMPATIBLE,
versionChecker.isCompatibleVersionString(remoteBuildString));
// inject acceptor fields
m_acceptor.decorate(returnJs, Optional.of(m_paused.get()));
byte jsBytes[] = returnJs.toString(4).getBytes(StandardCharsets.UTF_8);
ByteBuffer returnJsBuffer = ByteBuffer.allocate(4 + jsBytes.length);
returnJsBuffer.putInt(jsBytes.length);
returnJsBuffer.put(jsBytes).flip();
while (returnJsBuffer.hasRemaining()) {
sc.write(returnJsBuffer);
}
/*
* The type of connection, it can be a new request to join the cluster
* or a node that is connecting to the rest of the cluster and publishing its
* host id or a request to add a new connection to the request node.
*/
String type = jsObj.getString(TYPE);
/*
* The new connection may specify the address it is listening on,
* or it can be derived from the connection itself
*/
InetSocketAddress listeningAddress;
if (jsObj.has(ADDRESS)) {
listeningAddress = new InetSocketAddress(
InetAddress.getByName(jsObj.getString(ADDRESS)),
jsObj.getInt(PORT));
} else {
listeningAddress =
new InetSocketAddress(
((InetSocketAddress)sc.socket().
getRemoteSocketAddress()).getAddress().getHostAddress(),
jsObj.getInt(PORT));
}
hostLog.info("Received request type " + type);
if (type.equals(ConnectionType.REQUEST_HOSTID.name())) {
m_joinHandler.requestJoin(sc, listeningAddress, jsObj);
} else if (type.equals(ConnectionType.PUBLISH_HOSTID.name())){
m_joinHandler.notifyOfJoin(jsObj.getInt(HOST_ID), sc, listeningAddress, jsObj);
} else if (type.equals(ConnectionType.REQUEST_CONNECTION.name())) {
m_joinHandler.notifyOfConnection(jsObj.getInt(HOST_ID), sc, listeningAddress);
} else {
throw new RuntimeException("Unexpected message type " + type + " from " + remoteAddress);
}
} catch (Exception ex) {
// do not leak sockets when exception happens
try {
sc.close();
} catch (IOException ioex) {
// ignore the close exception on purpose
}
// re-throw the exception, it will be handled by the caller
throw ex;
}
}
}
/*
* After startup everything is a primary and can accept
* new nodes into the cluster. This loop accepts the new socket
* and passes it off the HostMessenger via the JoinHandler interface
*/
private void runPrimary() throws Exception {
try {
// start the server socket on the right interface
doBind();
while (true) {
try {
final int selectedKeyCount = m_selector.select();
if (selectedKeyCount == 0) continue;
Set<SelectionKey> selectedKeys = m_selector.selectedKeys();
try {
for (SelectionKey key : selectedKeys) {
processSSC((ServerSocketChannel)key.channel());
}
} finally {
selectedKeys.clear();
}
} catch (ClosedByInterruptException e) {
throw new InterruptedException();
} catch (ClosedSelectorException e) {
throw new InterruptedException();
} catch (Exception e) {
LOG.error("fault occurrent in the connection accept loop", e);
}
}
}
finally {
for (ServerSocketChannel ssc : m_listenerSockets) {
try {
ssc.close();
} catch (IOException e) {
}
}
m_listenerSockets.clear();
try {
m_selector.close();
} catch (IOException e) {
}
m_selector = null;
}
}
/**
* Read version info from a socket and check compatibility.
* After verifying versions return if "paused" start is indicated. True if paused start otherwise normal start.
*/
private JSONObject processJSONResponse(SocketChannel sc,
String remoteAddress,
Set<String> activeVersions,
boolean checkVersion) throws IOException, JSONException
{
// read the json response from socketjoiner with version info
JSONObject jsonResponse = readJSONObjFromWire(sc, remoteAddress);
if (!checkVersion) {
return jsonResponse;
}
VersionChecker versionChecker = m_acceptor.getVersionChecker();
String remoteVersionString = jsonResponse.getString(VERSION_STRING);
String remoteBuildString = jsonResponse.getString(BUILD_STRING);
boolean remoteAcceptsLocalVersion = jsonResponse.getBoolean(VERSION_COMPATIBLE);
if (remoteVersionString.equals(versionChecker.getVersionString())) {
if (!versionChecker.getBuildString().equals(remoteBuildString)) {
// ignore test/eclipse build string so tests still work
if (!versionChecker.getBuildString().equals("VoltDB") && !remoteBuildString.equals("VoltDB")) {
org.voltdb.VoltDB.crashLocalVoltDB("For VoltDB version " + versionChecker.getVersionString() +
" git tag/hash is not identical across the cluster. Node join failed.\n" +
" joining build string: " + versionChecker.getBuildString() + "\n" +
" existing build string: " + remoteBuildString, false, null);
return null;
}
}
}
else if (!remoteAcceptsLocalVersion) {
if (!versionChecker.isCompatibleVersionString(remoteVersionString)) {
org.voltdb.VoltDB.crashLocalVoltDB("Cluster contains nodes running VoltDB version " + remoteVersionString +
" which is incompatibile with local version " + versionChecker.getVersionString() +
".\n", false, null);
return null;
}
}
//Do this only after we think we are compatible.
activeVersions.add(remoteVersionString);
return jsonResponse;
}
/**
* Create socket to the leader node
*/
private SocketChannel createLeaderSocket(
SocketAddress hostAddr,
ConnectStrategy mode) throws IOException
{
SocketChannel socket = null;
int connectAttempts = 0;
while (socket == null) {
try {
socket = SocketChannel.open();
socket.socket().connect(hostAddr, 5000);
}
catch (java.net.ConnectException
|java.nio.channels.UnresolvedAddressException
|java.net.NoRouteToHostException
|java.net.PortUnreachableException e)
{
// reset the socket to null for loop purposes
socket = null;
if (mode == ConnectStrategy.PROBE) {
return null;
}
if (connectAttempts >= 8) {
LOG.warn("Joining primary failed: " + e.getMessage() + " retrying..");
}
try {
Thread.sleep(250); // milliseconds
}
catch (InterruptedException dontcare) {}
}
++connectAttempts;
}
return socket;
}
/**
* Create socket to the given host
*/
private SocketChannel connectToHost(SocketAddress hostAddr)
throws IOException
{
SocketChannel socket = null;
while (socket == null) {
try {
socket = SocketChannel.open(hostAddr);
}
catch (java.net.ConnectException e) {
LOG.warn("Joining host failed: " + e.getMessage() + " retrying..");
try {
Thread.sleep(250); // milliseconds
}
catch (InterruptedException dontcare) {}
}
}
return socket;
}
/**
* Connection handshake to the leader, ask the leader to assign a host Id
* for current node.
* @param
* @return array of two JSON objects, first is leader info, second is
* the response to our request
* @throws Exception
*/
private RequestHostIdResponse requestHostId (
SocketChannel socket,
List<Long> skews,
Set<String> activeVersions) throws Exception
{
// Read the timestamp off the wire and calculate skew for this connection
ByteBuffer currentTimeBuf = ByteBuffer.allocate(8);
while (currentTimeBuf.hasRemaining()) {
socket.read(currentTimeBuf);
}
currentTimeBuf.flip();
long skew = System.currentTimeMillis() - currentTimeBuf.getLong();
skews.add(skew);
VersionChecker versionChecker = m_acceptor.getVersionChecker();
activeVersions.add(versionChecker.getVersionString());
JSONObject jsObj = new JSONObject();
jsObj.put(TYPE, ConnectionType.REQUEST_HOSTID.name());
// put the version compatibility status in the json
jsObj.put(VERSION_STRING, versionChecker.getVersionString());
// Advertise the port we are going to listen on based on config
jsObj.put(PORT, m_internalPort);
// If config specified an internal interface use that.
// Otherwise the leader will echo back what we connected on
if (!m_internalInterface.isEmpty()) {
jsObj.put(ADDRESS, m_internalInterface);
}
// communicate configuration and node state
m_acceptor.decorate(jsObj, Optional.empty());
jsObj.put(MAY_EXCHANGE_TS, true);
byte jsBytes[] = jsObj.toString(4).getBytes(StandardCharsets.UTF_8);
ByteBuffer requestHostIdBuffer = ByteBuffer.allocate(4 + jsBytes.length);
requestHostIdBuffer.putInt(jsBytes.length);
requestHostIdBuffer.put(jsBytes).flip();
while (requestHostIdBuffer.hasRemaining()) {
socket.write(requestHostIdBuffer);
}
final String primaryAddress = socket.socket().getRemoteSocketAddress().toString();
// read the json response from socketjoiner with version info and validate it
JSONObject leaderInfo = processJSONResponse(socket, primaryAddress, activeVersions, true);
// read the json response sent by HostMessenger with HostID
JSONObject jsonObj = readJSONObjFromWire(socket, primaryAddress);
return new RequestHostIdResponse(leaderInfo, jsonObj);
}
/**
* Connection handshake to non-leader node, broadcast the new hostId to each node of the
* cluster (except the leader).
* @param
* @return JSONObject response message from peer node
* @throws Exception
*/
private JSONObject publishHostId(
InetSocketAddress hostAddr,
SocketChannel hostSocket,
List<Long> skews,
Set<String> activeVersions) throws Exception
{
final String remoteAddress = hostSocket.socket().getRemoteSocketAddress().toString();
/*
* Get the clock skew value
*/
ByteBuffer currentTimeBuf = ByteBuffer.allocate(8);
while (currentTimeBuf.hasRemaining()) {
hostSocket.read(currentTimeBuf);
}
currentTimeBuf.flip();
long skew = System.currentTimeMillis() - currentTimeBuf.getLong();
assert(currentTimeBuf.remaining() == 0);
skews.add(skew);
JSONObject jsObj = new JSONObject();
jsObj.put(TYPE, ConnectionType.PUBLISH_HOSTID.name());
jsObj.put(HOST_ID, m_localHostId);
jsObj.put(PORT, m_internalPort);
jsObj.put(ADDRESS,
m_internalInterface.isEmpty() ? m_reportedInternalInterface : m_internalInterface);
jsObj.put(VERSION_STRING, m_acceptor.getVersionChecker().getVersionString());
m_acceptor.decorate(jsObj, Optional.empty());
jsObj.put(MAY_EXCHANGE_TS, true);
byte[] jsBytes = jsObj.toString(4).getBytes(StandardCharsets.UTF_8);
ByteBuffer pushHostId = ByteBuffer.allocate(4 + jsBytes.length);
pushHostId.putInt(jsBytes.length);
pushHostId.put(jsBytes).flip();
while (pushHostId.hasRemaining()) {
hostSocket.write(pushHostId);
}
// read the json response from socketjoiner with version info and validate it
return processJSONResponse(hostSocket, remoteAddress, activeVersions, true);
}
public SocketChannel requestForConnection(InetSocketAddress hostAddr) throws IOException, JSONException
{
SocketChannel socket = connectToHost(hostAddr);
/*
* Get the clock skew value
*/
ByteBuffer currentTimeBuf = ByteBuffer.allocate(8);
while (currentTimeBuf.hasRemaining()) {
socket.read(currentTimeBuf);
}
assert currentTimeBuf.position() == 8 : "time buffer is at an unexpected position";
JSONObject jsObj = new JSONObject();
jsObj.put(TYPE, ConnectionType.REQUEST_CONNECTION.name());
jsObj.put(VERSION_STRING, m_acceptor.getVersionChecker().getVersionString());
jsObj.put(HOST_ID, m_localHostId);
jsObj.put(PORT, m_internalPort);
jsObj.put(ADDRESS,
m_internalInterface.isEmpty() ? m_reportedInternalInterface : m_internalInterface);
byte[] jsBytes = jsObj.toString(4).getBytes(StandardCharsets.UTF_8);
ByteBuffer addConnection = ByteBuffer.allocate(4 + jsBytes.length);
addConnection.putInt(jsBytes.length);
addConnection.put(jsBytes).flip();
while (addConnection.hasRemaining()) {
socket.write(addConnection);
}
// read the json response from socketjoiner with version info and validate it
final String remoteAddress = socket.socket().getRemoteSocketAddress().toString();
processJSONResponse(socket, remoteAddress, null, false);
return socket;
}
/*
* If this node failed to bind to the leader address
* it must connect to the leader which will generate a host id and
* advertise the rest of the cluster so that connectToPrimary can connect to it
*/
private void connectToPrimary(InetSocketAddress coordIp, ConnectStrategy mode) throws Exception {
// collect clock skews from all nodes
List<Long> skews = new ArrayList<Long>();
// collect the set of active voltdb version strings in the cluster
// this is used to limit simulatanious versions to two
Set<String> activeVersions = new TreeSet<String>();
try {
LOG.debug("Non-Primary Starting & Connecting to Primary");
SocketChannel socket = createLeaderSocket(coordIp, mode);
if (socket == null) return; // in probe mode
if (!coordIp.equals(m_coordIp)) {
m_coordIp = coordIp;
}
socket.socket().setTcpNoDelay(true);
socket.socket().setPerformancePreferences(0, 2, 1);
// blocking call, send a request to the leader node and get a host id assigned by the leader
RequestHostIdResponse response = requestHostId(socket, skews, activeVersions);
// check if the membership request is accepted
JSONObject responseBody = response.getResponseBody();
if (!responseBody.optBoolean(ACCEPTED, true)) {
socket.close();
if (!responseBody.optBoolean(MAY_RETRY, false)) {
org.voltdb.VoltDB.crashLocalVoltDB(
"Request to join cluster is rejected: "
+ responseBody.optString(REASON, "rejection reason is not available"));
}
throw new CoreUtils.RetryException(responseBody.optString(REASON, "rejection reason is not available"));
}
/*
* Get the generated host id, and the interface we connected on
* that was echoed back
*/
m_localHostId = responseBody.getInt(NEW_HOST_ID);
m_reportedInternalInterface = responseBody.getString(REPORTED_ADDRESS);
ImmutableMap.Builder<Integer, JSONObject> cmbld = ImmutableMap.builder();
cmbld.put(m_localHostId, m_acceptor.decorate(responseBody, Optional.<Boolean>empty()));
/*
* Loop over all the hosts and create a connection (except for the first entry, that is the leader)
* and publish the host id that was generated. This finishes creating the mesh
*/
JSONArray otherHosts = responseBody.getJSONArray(HOSTS);
int hostIds[] = new int[otherHosts.length()];
SocketChannel hostSockets[] = new SocketChannel[hostIds.length];
InetSocketAddress listeningAddresses[] = new InetSocketAddress[hostIds.length];
for (int ii = 0; ii < otherHosts.length(); ii++) {
JSONObject host = otherHosts.getJSONObject(ii);
String address = host.getString(ADDRESS);
int port = host.getInt(PORT);
final int hostId = host.getInt(HOST_ID);
LOG.info("Leader provided address " + address + ":" + port);
InetSocketAddress hostAddr = new InetSocketAddress(address, port);
if (ii == 0) {
//Leader already has a socket
hostIds[ii] = hostId;
listeningAddresses[ii] = hostAddr;
hostSockets[ii] = socket;
cmbld.put(ii, response.getLeaderInfo());
continue;
}
// connect to all the peer hosts (except leader) and advertise our existence
SocketChannel hostSocket = connectToHost(hostAddr);
JSONObject hostInfo = publishHostId(hostAddr, hostSocket, skews, activeVersions);
hostIds[ii] = hostId;
hostSockets[ii] = hostSocket;
listeningAddresses[ii] = hostAddr;
cmbld.put(ii, hostInfo);
}
/*
* The max difference of clock skew cannot exceed MAX_CLOCKSKEW, and the number of
* active versions in the cluster cannot be more than 2.
*/
checkClockSkew(skews);
checkActiveVersions(activeVersions, m_acceptor.getVersionChecker().getVersionString());
/*
* Notify the leader that we connected to the entire cluster, it will then go
* and queue a txn for our agreement site to join the cluster
*/
ByteBuffer joinCompleteBuffer = ByteBuffer.allocate(1);
while (joinCompleteBuffer.hasRemaining()) {
hostSockets[0].write(joinCompleteBuffer);
}
/*
* Let host messenger know about the connections.
* It will init the agreement site and then we are done.
*/
m_joinHandler.notifyOfHosts( m_localHostId, hostIds, hostSockets, listeningAddresses, cmbld.build());
} catch (ClosedByInterruptException e) {
//This is how shutdown is done
}
}
private static void checkClockSkew(List<Long> skews)
{
long maxSkew = Collections.max(skews);
long minSkew = Collections.min(skews);
long overallSkew = maxSkew - minSkew;
if (maxSkew > 0 && minSkew > 0) {
overallSkew = maxSkew;
} else if (maxSkew < 0 && minSkew < 0) {
overallSkew = Math.abs(minSkew);
}
if (overallSkew > MAX_CLOCKSKEW) {
org.voltdb.VoltDB.crashLocalVoltDB("Clock skew is " + overallSkew +
" which is > than the " + MAX_CLOCKSKEW + " millisecond limit. Make sure NTP is running.", false, null);
} else if (overallSkew > CRITICAL_CLOCKSKEW) {
final String msg = "Clock skew is " + overallSkew +
" which is high. Ideally it should be sub-millisecond. Make sure NTP is running.";
hostLog.warn(msg);
consoleLog.warn(msg);
} else {
hostLog.info("Clock skew to across all nodes in the cluster is " + overallSkew);
}
}
private static void checkActiveVersions(Set<String> activeVersions, String localVersion) {
/*
* Limit the number of active versions to 2.
*/
if (activeVersions.size() > 2) {
String versions = "";
// get the list of non-local versions
for (String version : activeVersions) {
if (!version.equals(localVersion)) {
versions += version + ", ";
}
}
// trim the trailing comma + space
versions = versions.substring(0, versions.length() - 2);
org.voltdb.VoltDB.crashLocalVoltDB("Cluster already is running mixed voltdb versions (" + versions +").\n" +
"Adding version " + localVersion + " would add a third version.\n" +
"VoltDB hotfix support supports only two unique versions simulaniously.", false, null);
}
}
public void shutdown() throws InterruptedException {
if (m_selector != null) {
try {
m_selector.close();
} catch (IOException e) {
}
}
m_es.shutdownNow();
m_es.awaitTermination(356, TimeUnit.DAYS);
for (ServerSocketChannel ssc : m_listenerSockets) {
try {
ssc.close();
} catch (IOException e) {
}
}
m_listenerSockets.clear();
if (m_selector != null) {
try {
m_selector.close();
} catch (IOException e) {
}
m_selector = null;
}
}
int getLocalHostId() {
return m_localHostId;
}
}