/* This file is part of VoltDB.
 * Copyright (C) 2008-2017 VoltDB Inc.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with VoltDB.  If not, see <http://www.gnu.org/licenses/>.
 */

package org.voltdb.client;

import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.UnknownHostException;
import java.nio.ByteBuffer;
import java.nio.channels.SocketChannel;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Random;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.Executors;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.locks.LockSupport;

import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLEngine;
import javax.security.auth.Subject;

import org.cliffc_voltpatches.high_scale_lib.NonBlockingHashMap;
import org.json_voltpatches.JSONException;
import org.json_voltpatches.JSONObject;
import org.voltcore.network.CipherExecutor;
import org.voltcore.network.Connection;
import org.voltcore.network.QueueMonitor;
import org.voltcore.network.VoltNetworkPool;
import org.voltcore.network.VoltNetworkPool.IOStatsIntf;
import org.voltcore.network.VoltProtocolHandler;
import org.voltcore.utils.CoreUtils;
import org.voltcore.utils.Pair;
import org.voltcore.utils.ssl.SSLConfiguration;
import org.voltdb.ClientResponseImpl;
import org.voltdb.VoltTable;
import org.voltdb.client.ClientStatusListenerExt.DisconnectCause;
import org.voltdb.client.HashinatorLite.HashinatorLiteType;
import org.voltdb.common.Constants;

import com.google_voltpatches.common.base.Throwables;
import com.google_voltpatches.common.collect.ImmutableList;
import com.google_voltpatches.common.collect.ImmutableSet;
import com.google_voltpatches.common.collect.ImmutableSortedMap;
import com.google_voltpatches.common.collect.Maps;
import com.google_voltpatches.common.collect.Sets;

import jsr166y.ThreadLocalRandom;

/**
 * De/multiplexes transactions across a cluster.
 *
 * It is safe to synchronize on an individual connection and then the distributer, but it is always unsafe
 * to synchronize on the distributer and then an individual connection.
 */
class Distributer {

    static int RESUBSCRIPTION_DELAY_MS = Integer.getInteger("RESUBSCRIPTION_DELAY_MS", 10000);
    static final long PING_HANDLE = Long.MAX_VALUE;
    public static final Long ASYNC_TOPO_HANDLE = PING_HANDLE - 1;
    public static final Long ASYNC_PROC_HANDLE = PING_HANDLE - 2;
    static final long USE_DEFAULT_CLIENT_TIMEOUT = 0;

    static long PARTITION_KEYS_INFO_REFRESH_FREQUENCY = Long.getLong("PARTITION_KEYS_INFO_REFRESH_FREQUENCY", 1000);

    // handles used internally are negative and decrement for each call
    public final AtomicLong m_sysHandle = new AtomicLong(-1);

    // collection of connections to the cluster
    private final CopyOnWriteArrayList<NodeConnection> m_connections = new CopyOnWriteArrayList<>();

    private final ArrayList<ClientStatusListenerExt> m_listeners = new ArrayList<>();

    // Selector and connection handling; does all work in a blocking selection thread
    private final VoltNetworkPool m_network;

    private final SSLContext m_sslContext;

    // Temporary until a distribution/affinity algorithm is written
    private int m_nextConnection = 0;

    private final boolean m_useMultipleThreads;
    private final boolean m_useClientAffinity;
    private final boolean m_sendReadsToReplicasBytDefaultIfCAEnabled;

    private static final class Procedure {
        final static int PARAMETER_NONE = -1;
        private final boolean multiPart;
        private final boolean readOnly;
        private final int partitionParameter;
        private final int partitionParameterType;

        private Procedure(boolean multiPart,
                          boolean readOnly,
                          int partitionParameter,
                          int partitionParameterType) {
            this.multiPart = multiPart;
            this.readOnly = readOnly;
            this.partitionParameter = multiPart ? PARAMETER_NONE : partitionParameter;
            this.partitionParameterType = multiPart ? PARAMETER_NONE : partitionParameterType;
        }
    }

    private final Map<Integer, NodeConnection> m_partitionMasters = new HashMap<>();
    private final Map<Integer, NodeConnection[]> m_partitionReplicas = new HashMap<>();
    private final Map<Integer, NodeConnection> m_hostIdToConnection = new HashMap<>();

    private final AtomicReference<ImmutableSortedMap<String, Procedure>> m_procedureInfo =
            new AtomicReference<ImmutableSortedMap<String, Procedure>>();
    private final AtomicReference<ImmutableSet<Integer>> m_partitionKeys =
            new AtomicReference<ImmutableSet<Integer>>();
    private final AtomicLong m_lastPartitionKeyFetched = new AtomicLong(0);
    private final AtomicReference<ClientResponse> m_partitionUpdateStatus =
            new AtomicReference<ClientResponse>();

    // This is the instance of the Hashinator we picked from TOPO, used only for client affinity.
    private HashinatorLite m_hashinator = null;

    // This is a global timeout that will be used if a per-procedure timeout is not provided with the procedure call.
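    // Both timeouts here are normally set by the user through ClientConfig
    // (see setProcedureCallTimeout and setConnectionResponseTimeout) before the
    // enclosing client is built; the fields below hold the copies, converted to
    // nanoseconds, that this class actually checks against.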
    private final long m_procedureCallTimeoutNanos;
    private static final long MINIMUM_LONG_RUNNING_SYSTEM_CALL_TIMEOUT_MS = 30 * 60 * 1000; // 30 minutes
    private final long m_connectionResponseTimeoutNanos;
    private final Map<Integer, ClientAffinityStats> m_clientAffinityStats = new HashMap<>();

    public final RateLimiter m_rateLimiter = new RateLimiter();

    private final AtomicReference<ImmutableSet<Integer>> m_unconnectedHosts =
            new AtomicReference<ImmutableSet<Integer>>();
    private AtomicBoolean m_createConnectionUponTopoChangeInProgress = new AtomicBoolean(false);
    private boolean m_topologyChangeAware;

    //private final Timer m_timer;
    private final ScheduledExecutorService m_ex = Executors.newSingleThreadScheduledExecutor(
            CoreUtils.getThreadFactory("VoltDB Client Reaper Thread"));
    ScheduledFuture<?> m_timeoutReaperHandle;

    /**
     * Server instance id. Unique for the cluster.
     */
    private Object m_clusterInstanceId[];

    private String m_buildString;

    /*
     * The connection we have issued our subscriptions to. If the connection is lost
     * we will need to request subscription from a different node.
     */
    private NodeConnection m_subscribedConnection = null;
    // Track if a request is pending so we don't accidentally handle a failed node twice
    private boolean m_subscriptionRequestPending = false;

    // Until catalog subscription is implemented, only fetch it once
    private boolean m_fetchedCatalog = false;

    /**
     * JAAS Authentication Subject
     */
    private final Subject m_subject;

    // executor service for ssl encryption/decryption, if ssl is enabled.
    private CipherExecutor m_cipherService;

    /**
     * Handles topology updates for client affinity
     */
    class TopoUpdateCallback implements ProcedureCallback {

        @Override
        public void clientCallback(ClientResponse clientResponse) throws Exception {
            if (clientResponse.getStatus() != ClientResponse.SUCCESS) {
                return;
            }

            try {
                synchronized (Distributer.this) {
                    VoltTable results[] = clientResponse.getResults();
                    if (results != null && results.length > 1) {
                        updateAffinityTopology(results);
                    }
                }
            }
            catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Handles partition updates for client affinity
     */
    class PartitionUpdateCallback implements ProcedureCallback {
        final CountDownLatch m_latch;

        PartitionUpdateCallback(CountDownLatch latch) {
            m_latch = latch;
        }

        @Override
        public void clientCallback(ClientResponse clientResponse) throws Exception {
            if (clientResponse.getStatus() == ClientResponse.SUCCESS) {
                VoltTable results[] = clientResponse.getResults();
                if (results != null && results.length > 0) {
                    updatePartitioning(results[0]);
                }
            }

            m_partitionUpdateStatus.set(clientResponse);

            if (m_latch != null) {
                m_latch.countDown();
            }
        }
    }

    /**
     * Handles @Subscribe response
     */
    class SubscribeCallback implements ProcedureCallback {

        @Override
        public void clientCallback(ClientResponse response) throws Exception {
            // Pre-4.1 clusters don't know about @Subscribe; don't stress over it.
            if (response.getStatusString() != null &&
                    response.getStatusString().contains("@Subscribe was not found")) {
                synchronized (Distributer.this) {
                    m_subscriptionRequestPending = false;
                }
                return;
            }

            // Fast path subscription retry if the connection was lost before getting a response
            if (response.getStatus() == ClientResponse.CONNECTION_LOST && !m_connections.isEmpty()) {
                subscribeToNewNode();
                return;
            }
            else if (response.getStatus() == ClientResponse.CONNECTION_LOST) {
                return;
            }

            // Slow path. God knows why it didn't succeed; the server could be paused
            // and in admin mode. Don't firehose attempts.
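            // The two-minute retry cadence below is deliberately coarse: subscription
            // is a background optimization, so a slow retry beats hammering a cluster
            // that may be paused or still resolving a failure. The isShutdown() check
            // guards the race where the client is closed before the retry is scheduled.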
            if (response.getStatus() != ClientResponse.SUCCESS && !m_ex.isShutdown()) {
                // Retry on the off chance that it will work the Nth time, or work at a different node
                m_ex.schedule(new Runnable() {
                    @Override
                    public void run() {
                        try {
                            subscribeToNewNode();
                        } catch (Throwable t) {
                            t.printStackTrace();
                            Throwables.propagate(t);
                        }
                    }
                }, 2, TimeUnit.MINUTES);
                return;
            }

            // If success, the code in NodeConnection.stopping needs to know it has to handle selecting
            // a new node for subscriptions, so set the pending request to false to let that code
            // know that the failure won't be handled in the callback
            synchronized (Distributer.this) {
                m_subscriptionRequestPending = false;
            }
        }
    }

    /**
     * Handles procedure updates for client affinity
     */
    class ProcUpdateCallback implements ProcedureCallback {

        @Override
        public void clientCallback(ClientResponse clientResponse) throws Exception {
            if (clientResponse.getStatus() != ClientResponse.SUCCESS) {
                return;
            }

            try {
                synchronized (Distributer.this) {
                    VoltTable results[] = clientResponse.getResults();
                    if (results != null && results.length == 1) {
                        VoltTable vt = results[0];
                        updateProcedurePartitioning(vt);
                    }
                    m_fetchedCatalog = true;
                }
            }
            catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    class CallExpiration implements Runnable {
        @Override
        public void run() {
            try {
                // make a threadsafe copy of all connections
                ArrayList<NodeConnection> connections = new ArrayList<>();
                synchronized (Distributer.this) {
                    connections.addAll(m_connections);
                }

                final long nowNanos = System.nanoTime();

                // for each connection
                for (final NodeConnection c : connections) {
                    // check for connection age
                    final long sinceLastResponse = Math.max(1, nowNanos - c.m_lastResponseTimeNanos);

                    // if a ping is outstanding and the connection timeout has elapsed, close the connection
                    if (c.m_outstandingPing && (sinceLastResponse > m_connectionResponseTimeoutNanos)) {
                        // memoize why it's closing
                        c.m_closeCause = DisconnectCause.TIMEOUT;
                        // this should trigger NodeConnection.stopping(..)
                        c.m_connection.unregister();
                    }

                    // if a third of the timeout has elapsed since the last response, send a ping
                    if ((!c.m_outstandingPing) && (sinceLastResponse > (m_connectionResponseTimeoutNanos / 3))) {
                        c.sendPing();
                    }

                    // for each outstanding procedure
                    for (final Map.Entry<Long, CallbackBookeeping> e : c.m_callbacks.entrySet()) {
                        final long handle = e.getKey();
                        final CallbackBookeeping cb = e.getValue();

                        // if the timeout is expired, call the callback and remove the
                        // bookkeeping data
                        final long deltaNanos = Math.max(1, nowNanos - cb.timestampNanos);
                        if (deltaNanos > cb.procedureTimeoutNanos) {
                            // For expected long operations don't use the default timeout
                            // unless it is > MINIMUM_LONG_RUNNING_SYSTEM_CALL_TIMEOUT_MS
                            final boolean isLongOp = isLongOp(cb.name);
                            if (isLongOp && (deltaNanos < TimeUnit.MILLISECONDS.toNanos(MINIMUM_LONG_RUNNING_SYSTEM_CALL_TIMEOUT_MS))) {
                                continue;
                            }

                            c.handleTimedoutCallback(handle, nowNanos);
                        }
                    }
                }
            } catch (Throwable t) {
                t.printStackTrace();
            }
        }
    }
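    /*
     * Worked example of the reaper arithmetic above, assuming the default
     * 2-minute connection response timeout: CallExpiration runs once per second,
     * sends a ping after roughly 40 seconds of silence (a third of the timeout),
     * and unregisters the connection once a full 120 seconds pass with that ping
     * still unanswered. A chatty connection never pings at all, because every
     * response refreshes m_lastResponseTimeNanos.
     */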
    /*
     * Check if the proc name is a procedure that is expected to run long.
     * Make the minimum timeout for certain long-running system procedures
     * higher than the default of 2 minutes. You can still set the default
     * timeout higher than even this value.
     */
    private static boolean isLongOp(String procName) {
        if (procName.startsWith("@")) {
            if (procName.equals("@UpdateApplicationCatalog") || procName.equals("@SnapshotSave")) {
                return true;
            }
        }
        return false;
    }

    class CallbackBookeeping {
        public CallbackBookeeping(long timestampNanos, ProcedureCallback callback, String name,
                long timeoutNanos, boolean ignoreBackpressure) {
            assert(callback != null);
            this.timestampNanos = timestampNanos;
            this.callback = callback;
            this.name = name;
            this.procedureTimeoutNanos = timeoutNanos;
            this.ignoreBackpressure = ignoreBackpressure;
        }
        long timestampNanos;
        // Timeout in nanoseconds; 0 means use the connection's configured procedure timeout.
        final long procedureTimeoutNanos;
        ProcedureCallback callback;
        String name;
        boolean ignoreBackpressure;
    }

    class NodeConnection extends VoltProtocolHandler implements org.voltcore.network.QueueMonitor {
        private final AtomicInteger m_callbacksToInvoke = new AtomicInteger(0);
        private final ConcurrentMap<Long, CallbackBookeeping> m_callbacks = new ConcurrentHashMap<>();
        private final NonBlockingHashMap<String, ClientStats> m_stats = new NonBlockingHashMap<>();
        private Connection m_connection;
        private volatile boolean m_isConnected = true;

        volatile long m_lastResponseTimeNanos = System.nanoTime();
        boolean m_outstandingPing = false;
        ClientStatusListenerExt.DisconnectCause m_closeCause = DisconnectCause.CONNECTION_CLOSED;

        public NodeConnection(long ids[]) {}

        /*
         * NodeConnection uses ignoreBackpressure to get the rate limiter to not
         * apply any permit tracking or rate limits to transactions that should
         * never be rejected, such as those submitted from within a callback thread or
         * generated internally
         */
        public void createWork(final long nowNanos, long handle, String name, ByteBuffer c,
                ProcedureCallback callback, boolean ignoreBackpressure, long timeoutNanos) {
            assert(callback != null);

            // How long from the starting point in time to wait to get this stuff done
            timeoutNanos = (timeoutNanos == Distributer.USE_DEFAULT_CLIENT_TIMEOUT) ?
                    m_procedureCallTimeoutNanos : timeoutNanos;

            // Trigger the timeout at this point in time no matter what
            final long timeoutTime = nowNanos + timeoutNanos;

            // What the time was after the rate limiter returned.
            // Will be roughly the same as nowNanos if it didn't block.
            long afterRateLimitNanos = 0;

            /*
             * Do rate limiting or check for max-outstanding related backpressure in
             * the rate limiter, which can block. If it blocks we can still get a timeout
             * exception to give prompt timeouts
             */
            try {
                afterRateLimitNanos = m_rateLimiter.sendTxnWithOptionalBlockAndReturnCurrentTime(
                        nowNanos, timeoutNanos, ignoreBackpressure);
            } catch (TimeoutException e) {
                /*
                 * It's possible we need to timeout because it took too long to get
                 * the transaction out on the wire due to max outstanding
                 */
                final long deltaNanos = Math.max(1, System.nanoTime() - nowNanos);
                invokeCallbackWithTimeout(name, callback, deltaNanos, afterRateLimitNanos,
                        timeoutNanos, handle, ignoreBackpressure);
                return;
            }

            assert(m_callbacks.containsKey(handle) == false);

            // Drain needs to know when all callbacks have been invoked
            final int callbacksToInvoke = m_callbacksToInvoke.incrementAndGet();
            assert(callbacksToInvoke >= 0);

            // Optimistically submit the task
            m_callbacks.put(handle, new CallbackBookeeping(nowNanos, callback, name, timeoutNanos, ignoreBackpressure));
            // Schedule the timeout to fire relative to the amount of time
            // spent getting to this point. Might fire immediately
            // some of the time, but that is fine
            final long timeoutRemaining = timeoutTime - afterRateLimitNanos;

            // Schedule an individual timeout if necessary.
            // If it is a long op, don't bother scheduling a discrete timeout
            if (timeoutNanos < TimeUnit.SECONDS.toNanos(1) && !isLongOp(name)) {
                submitDiscreteTimeoutTask(handle, Math.max(0, timeoutRemaining));
            }

            // Check for disconnect
            if (!m_isConnected) {
                // Check if the disconnect or expiration already handled the callback
                if (m_callbacks.remove(handle) == null) {
                    return;
                }
                final ClientResponse r = new ClientResponseImpl(
                        ClientResponse.CONNECTION_LOST, new VoltTable[0],
                        "Connection to database host (" + m_connection.getHostnameAndIPAndPort() +
                        ") was lost before a response was received");
                try {
                    callback.clientCallback(r);
                } catch (Exception e) {
                    uncaughtException(callback, r, e);
                }

                // Drain needs to know when all callbacks have been invoked
                final int remainingToInvoke = m_callbacksToInvoke.decrementAndGet();
                assert(remainingToInvoke >= 0);

                // for bookkeeping, but it feels dishonest to call this here
                m_rateLimiter.transactionResponseReceived(nowNanos, -1, ignoreBackpressure);
                return;
            }
            else {
                m_connection.writeStream().enqueue(c);
            }
        }

        /*
         * For high precision timeouts, submit a discrete task to a scheduled
         * executor service to time out the transaction. The timeout task
         * when run checks if the task is still present in the concurrent map
         * of tasks and removes it. If it wins the race to remove the map entry
         * then the transaction will be timed out even if a response is received
         * at the same time.
         *
         * This will race with the periodic task that checks lower resolution timeouts,
         * and that is fine; the concurrent map makes sure each callback is handled exactly once
         */
        void submitDiscreteTimeoutTask(final long handle, long timeoutNanos) {
            m_ex.schedule(new Runnable() {
                @Override
                public void run() {
                    handleTimedoutCallback(handle, System.nanoTime());
                }
            }, timeoutNanos, TimeUnit.NANOSECONDS);
        }

        /*
         * Factor out the boilerplate involved in checking whether a timed out callback
         * still exists and needs to be invoked, or has already been handled by another thread
         */
        void handleTimedoutCallback(long handle, long nowNanos) {
            // The callback doesn't have to be there; it may have already
            // received a response, or been expired by the periodic expiration task or a discrete expiration task
            final CallbackBookeeping cb = m_callbacks.remove(handle);

            // It was handled during the race
            if (cb == null) {
                return;
            }

            final long deltaNanos = Math.max(1, nowNanos - cb.timestampNanos);
            invokeCallbackWithTimeout(cb.name, cb.callback, deltaNanos, nowNanos,
                    cb.procedureTimeoutNanos, handle, cb.ignoreBackpressure);
        }
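        /*
         * Caller-side sketch (assumed usage, not part of this file): a user
         * callback can distinguish these client-generated timeouts from server
         * failures by checking
         *     response.getStatus() == ClientResponse.CONNECTION_TIMEOUT
         * before treating the transaction as failed. Note the transaction may
         * still commit on the server after the client gives up waiting.
         */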
        /*
         * Factor out the boilerplate involved in invoking a callback with a timeout response
         */
        void invokeCallbackWithTimeout(String procName,
                ProcedureCallback callback,
                long deltaNanos,
                long nowNanos,
                long timeoutNanos,
                long handle,
                boolean ignoreBackpressure) {
            ClientResponseImpl r = new ClientResponseImpl(
                    ClientResponse.CONNECTION_TIMEOUT,
                    ClientResponse.UNINITIALIZED_APP_STATUS_CODE,
                    "",
                    new VoltTable[0],
                    String.format("No response received in the allotted time (set to %d ms).",
                            TimeUnit.NANOSECONDS.toMillis(timeoutNanos)));
            r.setClientHandle(handle);
            r.setClientRoundtrip(deltaNanos);
            r.setClusterRoundtrip((int)TimeUnit.NANOSECONDS.toMillis(deltaNanos));

            try {
                callback.clientCallback(r);
            } catch (Throwable e1) {
                uncaughtException(callback, r, e1);
            }

            // Drain needs to know when all callbacks have been invoked
            final int remainingToInvoke = m_callbacksToInvoke.decrementAndGet();
            assert(remainingToInvoke >= 0);

            m_rateLimiter.transactionResponseReceived(nowNanos, -1, ignoreBackpressure);
            updateStatsForTimeout(procName, r.getClientRoundtripNanos(), r.getClusterRoundtrip());
        }

        void sendPing() {
            ProcedureInvocation invocation = new ProcedureInvocation(PING_HANDLE, "@Ping");
            ByteBuffer buf = ByteBuffer.allocate(4 + invocation.getSerializedSize());
            buf.putInt(buf.capacity() - 4);
            try {
                invocation.flattenToBuffer(buf);
                buf.flip();
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
            m_connection.writeStream().enqueue(buf);
            m_outstandingPing = true;
        }

        private void updateStatsForTimeout(
                final String procName,
                final long roundTripNanos,
                final int clusterRoundTrip) {
            m_connection.queueTask(new Runnable() {
                @Override
                public void run() {
                    updateStats(procName, roundTripNanos, clusterRoundTrip, false, false, true);
                }
            });
        }

        /**
         * Update the procedure statistics.
         * @param procName Name of the procedure being updated
         * @param roundTripNanos round trip time measured at this client
         * @param clusterRoundTrip round trip measured within the VoltDB cluster
         * @param abort true if the procedure was aborted
         * @param failure true if the procedure failed
         * @param timeout true if the procedure timed out at the client
         */
        private void updateStats(
                String procName,
                long roundTripNanos,
                int clusterRoundTrip,
                boolean abort,
                boolean failure,
                boolean timeout) {
            ClientStats stats = m_stats.get(procName);
            if (stats == null) {
                stats = new ClientStats();
                stats.m_connectionId = connectionId();
                stats.m_hostname = m_connection.getHostnameOrIP();
                stats.m_port = m_connection.getRemotePort();
                stats.m_procName = procName;
                stats.m_startTS = System.currentTimeMillis();
                stats.m_endTS = Long.MIN_VALUE;
                m_stats.put(procName, stats);
            }
            stats.update(roundTripNanos, clusterRoundTrip, abort, failure, timeout);
        }

        @Override
        public void handleMessage(ByteBuffer buf, Connection c) {
            long nowNanos = System.nanoTime();
            ClientResponseImpl response = new ClientResponseImpl();
            try {
                response.initFromBuffer(buf);
            } catch (IOException e1) {
                // nothing sane to do with a malformed response but log and fall through
                e1.printStackTrace();
            }

            // track the timestamp of the most recent read on this connection
            m_lastResponseTimeNanos = nowNanos;

            final long handle = response.getClientHandle();

            // handle ping response and get out
            if (handle == PING_HANDLE) {
                m_outstandingPing = false;
                return;
            }
            else if (handle == ASYNC_TOPO_HANDLE) {
                /*
                 * Really didn't want to add this block because it is not DRY
                 * for the exception handling, but trying to set + reset the async topo callback
                 * turned out to be pretty challenging
                 */
                ProcedureCallback cb = new TopoUpdateCallback();
                try {
                    cb.clientCallback(response);
                } catch (Exception e) {
                    uncaughtException(cb, response, e);
                }
                return;
            }
            else if (handle == ASYNC_PROC_HANDLE) {
                ProcedureCallback cb = new ProcUpdateCallback();
                try {
                    cb.clientCallback(response);
                } catch (Exception e) {
                    uncaughtException(cb, response, e);
                }
                return;
            }

            // Race with the expiration thread to be the first to remove the callback
            // from the map and process it
            final CallbackBookeeping stuff = m_callbacks.remove(response.getClientHandle());

            // presumably (hopefully) this is a response for a timed-out message
            if (stuff == null) {
                // also ignore internal (topology and procedure) calls
                if (handle >= 0) {
                    // notify any listeners of the late response
                    for (ClientStatusListenerExt listener : m_listeners) {
                        listener.lateProcedureResponse(
                                response,
                                m_connection.getHostnameOrIP(),
                                m_connection.getRemotePort());
                    }
                }
            }
            // handle a proper callback
            else {
                final long callTimeNanos = stuff.timestampNanos;
                final long deltaNanos = Math.max(1, nowNanos - callTimeNanos);
                final ProcedureCallback cb = stuff.callback;
                assert(cb != null);

                final byte status = response.getStatus();
                boolean abort = false;
                boolean error = false;
                if (status == ClientResponse.USER_ABORT || status == ClientResponse.GRACEFUL_FAILURE) {
                    abort = true;
                }
                else if (status != ClientResponse.SUCCESS) {
                    error = true;
                }

                int clusterRoundTrip = response.getClusterRoundtrip();
                m_rateLimiter.transactionResponseReceived(nowNanos, clusterRoundTrip, stuff.ignoreBackpressure);
                updateStats(stuff.name, deltaNanos, clusterRoundTrip, abort, error, false);
                response.setClientRoundtrip(deltaNanos);
                assert(response.getHashes() == null) : "A determinism hash snuck into the client wire protocol";
                try {
                    cb.clientCallback(response);
                } catch (Exception e) {
                    uncaughtException(cb, response, e);
                }

                // Drain needs to know when all callbacks have been invoked
                final int remainingToInvoke = m_callbacksToInvoke.decrementAndGet();
                assert(remainingToInvoke >= 0);
            }
        }

        @Override
        public int getMaxRead() {
            return Integer.MAX_VALUE;
        }

        public boolean hadBackPressure() {
            return m_connection.writeStream().hadBackPressure();
        }

        @Override
        public void stopping(Connection c) {
            super.stopping(c);
            m_isConnected = false;
            // Prevent queueing of new work to this connection
            synchronized (Distributer.this) {
                /*
                 * Repair all cluster topology data with the node connection removed
                 */
                Iterator<Map.Entry<Integer, NodeConnection>> i = m_partitionMasters.entrySet().iterator();
                while (i.hasNext()) {
                    Map.Entry<Integer, NodeConnection> entry = i.next();
                    if (entry.getValue() == this) {
                        i.remove();
                    }
                }

                i = m_hostIdToConnection.entrySet().iterator();
                while (i.hasNext()) {
                    Map.Entry<Integer, NodeConnection> entry = i.next();
                    if (entry.getValue() == this) {
                        i.remove();
                    }
                }

                Iterator<Map.Entry<Integer, NodeConnection[]>> i2 = m_partitionReplicas.entrySet().iterator();
                List<Pair<Integer, NodeConnection[]>> entriesToRewrite = new ArrayList<>();
                while (i2.hasNext()) {
                    Map.Entry<Integer, NodeConnection[]> entry = i2.next();
                    for (NodeConnection nc : entry.getValue()) {
                        if (nc == this) {
                            entriesToRewrite.add(Pair.of(entry.getKey(), entry.getValue()));
                        }
                    }
                }

                for (Pair<Integer, NodeConnection[]> entry : entriesToRewrite) {
                    m_partitionReplicas.remove(entry.getFirst());
                    NodeConnection survivors[] = new NodeConnection[entry.getSecond().length - 1];
                    if (survivors.length == 0) {
                        break;
                    }
                    int zz = 0;
                    for (int ii = 0; ii < entry.getSecond().length; ii++) {
                        if (entry.getSecond()[ii] != this) {
                            survivors[zz++] = entry.getSecond()[ii];
                        }
                    }
                    m_partitionReplicas.put(entry.getFirst(), survivors);
                }

                m_connections.remove(this);

                // Notify listeners that a connection has been lost
                for (ClientStatusListenerExt s : m_listeners) {
                    s.connectionLost(
                            m_connection.getHostnameOrIP(),
                            m_connection.getRemotePort(),
                            m_connections.size(),
                            m_closeCause);
                }

                /*
                 * Deal with the fact that this may have been the connection that subscriptions were issued to.
                 * If a subscription request was pending, don't handle selecting a new node here;
                 * let the callback see the failure and retry.
                 */
                if (m_useClientAffinity &&
                        m_subscribedConnection == this &&
                        m_subscriptionRequestPending == false &&
                        !m_ex.isShutdown()) {
                    // Don't subscribe to a new node immediately
                    // to somewhat prevent a thundering herd
                    try {
                        m_ex.schedule(new Runnable() {
                            @Override
                            public void run() {
                                subscribeToNewNode();
                            }
                        }, new Random().nextInt(RESUBSCRIPTION_DELAY_MS), TimeUnit.MILLISECONDS);
                    } catch (RejectedExecutionException ree) {
                        // this is for the race where m_ex shuts down in the middle of schedule
                        return;
                    }
                }
            }

            // Invoke callbacks for all queued invocations with a failure response
            final ClientResponse r = new ClientResponseImpl(
                    ClientResponse.CONNECTION_LOST, new VoltTable[0],
                    "Connection to database host (" + m_connection.getHostnameAndIPAndPort() +
                    ") was lost before a response was received");
            for (Map.Entry<Long, CallbackBookeeping> e : m_callbacks.entrySet()) {
                // Check for a race with other threads
                if (m_callbacks.remove(e.getKey()) == null) {
                    continue;
                }
                final CallbackBookeeping callBk = e.getValue();
                try {
                    callBk.callback.clientCallback(r);
                }
                catch (Exception ex) {
                    uncaughtException(callBk.callback, r, ex);
                }

                // Drain needs to know when all callbacks have been invoked
                final int remainingToInvoke = m_callbacksToInvoke.decrementAndGet();
                assert(remainingToInvoke >= 0);

                m_rateLimiter.transactionResponseReceived(System.nanoTime(), -1, callBk.ignoreBackpressure);
            }
        }

        @Override
        public Runnable offBackPressure() {
            return new Runnable() {
                @Override
                public void run() {
                    /*
                     * Synchronization on Distributer.this is critical to ensure that queue
                     * does not report backpressure AFTER the write stream reports that backpressure
                     * has ended, thus resulting in a lost wakeup.
                     */
                    synchronized (Distributer.this) {
                        for (final ClientStatusListenerExt csl : m_listeners) {
                            csl.backpressure(false);
                        }
                    }
                }
            };
        }

        @Override
        public Runnable onBackPressure() {
            return null;
        }

        @Override
        public QueueMonitor writestreamMonitor() {
            return this;
        }

        private int m_queuedBytes = 0;
        private final int m_maxQueuedBytes = 262144;

        @Override
        public boolean queue(int bytes) {
            m_queuedBytes += bytes;
            if (m_queuedBytes > m_maxQueuedBytes) {
                return true;
            }
            return false;
        }

        public InetSocketAddress getSocketAddress() {
            return m_connection.getRemoteSocketAddress();
        }
    }

    void drain() throws InterruptedException {
        boolean more;
        long sleep = 500;
        do {
            more = false;
            for (NodeConnection cxn : m_connections) {
                more = more || cxn.m_callbacksToInvoke.get() > 0;
            }
            /*
             * Back off from 500 microseconds to a cap of five millis between checks.
             * Try and get drain to be a little more prompt. Spinning sucks!
             */
            if (more) {
                if (Thread.interrupted()) {
                    throw new InterruptedException();
                }
                LockSupport.parkNanos(TimeUnit.MICROSECONDS.toNanos(sleep));
                if (Thread.interrupted()) {
                    throw new InterruptedException();
                }
                if (sleep < 5000) {
                    sleep += 500;
                }
            }
        } while(more);
    }
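    /*
     * Construction note (assumed context, based on how this package fits together):
     * a Distributer is not created directly by user code; ClientImpl builds one from
     * the knobs on ClientConfig and then calls createConnection(...) per server. The
     * no-argument constructor below simply applies the ClientConfig defaults.
     */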
    Distributer() {
        this(false,
             ClientConfig.DEFAULT_PROCEDURE_TIMOUT_NANOS,
             ClientConfig.DEFAULT_CONNECTION_TIMOUT_MS,
             false,
             false,
             null,
             null);
    }

    Distributer(
            boolean useMultipleThreads,
            long procedureCallTimeoutNanos,
            long connectionResponseTimeoutMS,
            boolean useClientAffinity,
            boolean sendReadsToReplicasBytDefault,
            Subject subject,
            SSLContext sslContext) {
        m_useMultipleThreads = useMultipleThreads;
        m_sslContext = sslContext;
        if (m_sslContext != null) {
            m_cipherService = CipherExecutor.CLIENT;
            m_cipherService.startup();
        } else {
            m_cipherService = null;
        }
        m_network = new VoltNetworkPool(
                m_useMultipleThreads ? Math.max(1, CoreUtils.availableProcessors() / 4) : 1,
                1, null, "Client");
        m_network.start();
        m_procedureCallTimeoutNanos = procedureCallTimeoutNanos;
        m_connectionResponseTimeoutNanos = TimeUnit.MILLISECONDS.toNanos(connectionResponseTimeoutMS);
        m_useClientAffinity = useClientAffinity;
        m_sendReadsToReplicasBytDefaultIfCAEnabled = sendReadsToReplicasBytDefault;

        // schedule the task that looks for timed-out proc calls and connections
        m_timeoutReaperHandle = m_ex.scheduleAtFixedRate(new CallExpiration(), 1, 1, TimeUnit.SECONDS);
        m_subject = subject;
    }

    void createConnection(String host, String program, String password, int port, ClientAuthScheme scheme)
            throws UnknownHostException, IOException {
        byte hashedPassword[] = ConnectionUtil.getHashedPassword(scheme, password);
        createConnectionWithHashedCredentials(host, program, hashedPassword, port, scheme);
    }

    void createConnectionWithHashedCredentials(String host, String program, byte[] hashedPassword, int port, ClientAuthScheme scheme)
            throws UnknownHostException, IOException {
        SSLEngine sslEngine = null;

        if (m_sslContext != null) {
            sslEngine = m_sslContext.createSSLEngine("client", port);
            sslEngine.setUseClientMode(true);

            Set<String> enabled = ImmutableSet.copyOf(sslEngine.getEnabledCipherSuites());
            Set<String> intersection = Sets.intersection(SSLConfiguration.GCM_CIPHERS, enabled);
            if (intersection.isEmpty()) {
                intersection = Sets.intersection(SSLConfiguration.PREFERRED_CIPHERS, enabled);
            }
            if (intersection.isEmpty()) {
                intersection = enabled;
            }
            sslEngine.setEnabledCipherSuites(intersection.toArray(new String[0]));
        }

        final Object socketChannelAndInstanceIdAndBuildString[] =
                ConnectionUtil.getAuthenticatedConnection(host, program, hashedPassword, port, m_subject, scheme, sslEngine);
        final SocketChannel aChannel = (SocketChannel)socketChannelAndInstanceIdAndBuildString[0];
        final long instanceIdWhichIsTimestampAndLeaderIp[] = (long[])socketChannelAndInstanceIdAndBuildString[1];
        final int hostId = (int)instanceIdWhichIsTimestampAndLeaderIp[0];

        NodeConnection cxn = new NodeConnection(instanceIdWhichIsTimestampAndLeaderIp);
        Connection c = null;
        try {
            if (aChannel != null) {
                c = m_network.registerChannel(aChannel, cxn, m_cipherService, sslEngine);
            }
        }
        catch (Exception e) {
            // Need to clean up the socket if there was any failure
            try {
                aChannel.close();
            } catch (IOException e1) {
                // Don't care; the connection is already lost anyway
            }
            Throwables.propagate(e);
        }
        cxn.m_connection = c;

        synchronized (this) {
            // If there are no connections, discard any previous connection ids and allow the client
            // to connect to a new cluster.
            // Careful, this is slightly less safe than the previous behavior.
            if (m_connections.size() == 0) {
                m_clusterInstanceId = null;
            }

            if (m_clusterInstanceId == null) {
                long timestamp = instanceIdWhichIsTimestampAndLeaderIp[2];
                int addr = (int)instanceIdWhichIsTimestampAndLeaderIp[3];
                m_clusterInstanceId = new Object[] { timestamp, addr };
            } else {
                if (!(((Long)m_clusterInstanceId[0]).longValue() == instanceIdWhichIsTimestampAndLeaderIp[2]) ||
                        !(((Integer)m_clusterInstanceId[1]).longValue() == instanceIdWhichIsTimestampAndLeaderIp[3])) {
                    // clean up the pre-registered voltnetwork connection/channel
                    c.unregister();
                    throw new IOException(
                            "Cluster instance id mismatch. Current is " + m_clusterInstanceId[0] + "," + m_clusterInstanceId[1] +
                            " and server's was " + instanceIdWhichIsTimestampAndLeaderIp[2] + "," + instanceIdWhichIsTimestampAndLeaderIp[3]);
                }
            }
            m_buildString = (String)socketChannelAndInstanceIdAndBuildString[2];

            m_connections.add(cxn);
        }

        if (m_useClientAffinity) {
            synchronized (this) {
                m_hostIdToConnection.put(hostId, cxn);
            }

            if (m_subscribedConnection == null) {
                subscribeToNewNode();
            }
        }
    }

    /*
     * Subscribe to receive async updates on a new node connection. This will set
     * m_subscribedConnection to the provided connection.
     *
     * If we are subscribing to a new connection on node failure this will also fetch the topology post node
     * failure. If the cluster hasn't finished resolving the failure it is fine; we will get the new topo
     * through the subscription.
     */
    private void subscribeToNewNode() {
        // Technically necessary to synchronize for safe publication of this store
        NodeConnection cxn = null;
        synchronized (Distributer.this) {
            m_subscribedConnection = null;
            if (!m_connections.isEmpty()) {
                cxn = m_connections.get(new Random().nextInt(m_connections.size()));
                m_subscriptionRequestPending = true;
                m_subscribedConnection = cxn;
            } else {
                return;
            }
        }

        try {
            // Subscribe to topology updates before retrieving the current topo
            // so there isn't potential for lost updates
            ProcedureInvocation spi = new ProcedureInvocation(m_sysHandle.getAndDecrement(), "@Subscribe", "TOPOLOGY");
            cxn.createWork(System.nanoTime(),
                    spi.getHandle(),
                    spi.getProcName(),
                    serializeSPI(spi),
                    new SubscribeCallback(),
                    true,
                    USE_DEFAULT_CLIENT_TIMEOUT);

            spi = new ProcedureInvocation(m_sysHandle.getAndDecrement(), "@Statistics", "TOPO", 0);
            // The handle is specific to topology updates and has special cased handling
            cxn.createWork(System.nanoTime(),
                    spi.getHandle(),
                    spi.getProcName(),
                    serializeSPI(spi),
                    new TopoUpdateCallback(),
                    true,
                    USE_DEFAULT_CLIENT_TIMEOUT);

            // Don't need to retrieve procedure updates every time we do a new subscription,
            // since catalog changes aren't correlated with node failure the way topo is
            if (!m_fetchedCatalog) {
                spi = new ProcedureInvocation(m_sysHandle.getAndDecrement(), "@SystemCatalog", "PROCEDURES");
                // The handle is specific to procedure updates and has special cased handling
                cxn.createWork(System.nanoTime(),
                        spi.getHandle(),
                        spi.getProcName(),
                        serializeSPI(spi),
                        new ProcUpdateCallback(),
                        true,
                        USE_DEFAULT_CLIENT_TIMEOUT);
            }

            // Partition key update
            refreshPartitionKeys(true);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
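    /*
     * Routing order used by queue() below, summarized for orientation:
     *   1. If client affinity is on and the hashinator is initialized, hash the
     *      partition parameter. Single-partition reads may go to a random
     *      backpressure-free replica (when reads-to-replicas is enabled), while
     *      writes and everything else target the partition master.
     *   2. If affinity produced nothing usable (unknown procedure, disconnected
     *      pick), fall back to round-robin across all connections, skipping
     *      connections under backpressure unless ignoreBackpressure is set.
     */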
    /**
     * Queue an invocation on the first node connection without backpressure. If there is none
     * without backpressure, return false and don't queue the invocation.
     * @param invocation
     * @param cb
     * @param ignoreBackpressure If true the invocation will be queued even if there is backpressure
     * @param nowNanos Current time in nanoseconds using System.nanoTime
     * @param timeoutNanos nanoseconds from nowNanos at which the timeout should fire
     * @return True if the message was queued and false if the message was not queued due to backpressure
     * @throws NoConnectionsException
     */
    boolean queue(
            ProcedureInvocation invocation,
            ProcedureCallback cb,
            final boolean ignoreBackpressure,
            final long nowNanos,
            final long timeoutNanos)
            throws NoConnectionsException {
        assert(invocation != null);
        assert(cb != null);

        NodeConnection cxn = null;
        boolean backpressure = true;

        /*
         * Synchronization is necessary to ensure that m_connections is not modified,
         * as well as to ensure that backpressure is reported correctly
         */
        synchronized (this) {
            final int totalConnections = m_connections.size();

            if (totalConnections == 0) {
                throw new NoConnectionsException("No connections.");
            }

            /*
             * Check if the master for the partition is known. No backpressure check here, to ensure correct
             * routing, but backpressure will be managed anyway. This is where we guess the partition based on
             * client affinity and known topology (hashinator initialized).
             */
            if (m_useClientAffinity && (m_hashinator != null)) {
                final ImmutableSortedMap<String, Procedure> procedures = m_procedureInfo.get();
                Procedure procedureInfo = null;
                if (procedures != null) {
                    procedureInfo = procedures.get(invocation.getProcName());
                }
                Integer hashedPartition = -1;

                if (procedureInfo != null) {
                    hashedPartition = Constants.MP_INIT_PID;
                    if ((!procedureInfo.multiPart) &&
                            // User may have passed too few parameters to allow dispatching.
                            // Avoid an indexing error here to fall through to the proper ProcCallException.
                            (procedureInfo.partitionParameter < invocation.getPassedParamCount())) {
                        hashedPartition = m_hashinator.getHashedPartitionForParameter(
                                procedureInfo.partitionParameterType,
                                invocation.getPartitionParamValue(procedureInfo.partitionParameter));
                    }

                    /*
                     * If the procedure is read only and single part, and the user wants it,
                     * load balance across replicas.
                     * This is probably slower for SAFE consistency.
                     */
                    if (!procedureInfo.multiPart && procedureInfo.readOnly && m_sendReadsToReplicasBytDefaultIfCAEnabled) {
                        NodeConnection partitionReplicas[] = m_partitionReplicas.get(hashedPartition);
                        if (partitionReplicas != null && partitionReplicas.length > 0) {
                            cxn = partitionReplicas[ThreadLocalRandom.current().nextInt(partitionReplicas.length)];
                            if (cxn.hadBackPressure()) {
                                // See if there is one without backpressure; make sure it's still connected
                                for (NodeConnection nc : partitionReplicas) {
                                    if (!nc.hadBackPressure() && nc.m_isConnected) {
                                        cxn = nc;
                                        break;
                                    }
                                }
                            }
                            if (!cxn.hadBackPressure() || ignoreBackpressure) {
                                backpressure = false;
                            }
                        }
                    } else {
                        /*
                         * For writes or SAFE reads, this is the best way to go
                         */
                        cxn = m_partitionMasters.get(hashedPartition);
                        if (cxn != null && !cxn.hadBackPressure() || ignoreBackpressure) {
                            backpressure = false;
                        }
                    }
                }
                if (cxn != null && !cxn.m_isConnected) {
                    // Would be nice to log something here
                    // Client affinity picked a connection that was actually disconnected. Reset to null
                    // and let the round-robin choice pick a connection
                    cxn = null;
                }

                ClientAffinityStats stats = m_clientAffinityStats.get(hashedPartition);
                if (stats == null) {
                    stats = new ClientAffinityStats(hashedPartition, 0, 0, 0, 0);
                    m_clientAffinityStats.put(hashedPartition, stats);
                }
                if (cxn != null) {
                    if (procedureInfo != null && procedureInfo.readOnly) {
                        stats.addAffinityRead();
                    }
                    else {
                        stats.addAffinityWrite();
                    }
                }
                // account these here because we lose the partition ID and procedure info once we
                // bust out of this scope.
                else {
                    if (procedureInfo != null && procedureInfo.readOnly) {
                        stats.addRrRead();
                    }
                    else {
                        stats.addRrWrite();
                    }
                }
            }

            if (cxn == null) {
                for (int i = 0; i < totalConnections; ++i) {
                    cxn = m_connections.get(Math.abs(++m_nextConnection % totalConnections));
                    if (!cxn.hadBackPressure() || ignoreBackpressure) {
                        // serialize and queue the invocation
                        backpressure = false;
                        break;
                    }
                }
            }

            if (backpressure) {
                cxn = null;
                for (ClientStatusListenerExt s : m_listeners) {
                    s.backpressure(true);
                }
            }
        }

        /*
         * Do the heavyweight serialization outside the synchronized block.
         * createWork synchronizes on an individual connection, which allows for more concurrency
         */
        if (cxn != null) {
            ByteBuffer buf = null;
            try {
                buf = serializeSPI(invocation);
            } catch (Exception e) {
                Throwables.propagate(e);
            }
            cxn.createWork(nowNanos, invocation.getHandle(), invocation.getProcName(), buf, cb, ignoreBackpressure, timeoutNanos);
        }
        if (m_topologyChangeAware) {
            createConnectionsUponTopologyChange();
        }
        return !backpressure;
    }

    /**
     * Shutdown the VoltNetwork, allowing the Ports to close and free resources
     * like memory pools
     * @throws InterruptedException
     */
    final void shutdown() throws InterruptedException {
        // stop the old proc call reaper
        m_timeoutReaperHandle.cancel(false);
        m_ex.shutdown();
        if (CoreUtils.isJunitTest()) {
            m_ex.awaitTermination(1, TimeUnit.SECONDS);
        } else {
            m_ex.awaitTermination(365, TimeUnit.DAYS);
        }

        m_network.shutdown();
        if (m_cipherService != null) {
            m_cipherService.shutdown();
            m_cipherService = null;
        }
    }

    void uncaughtException(ProcedureCallback cb, ClientResponse r, Throwable t) {
        boolean handledByClient = false;
        for (ClientStatusListenerExt csl : m_listeners) {
            if (csl instanceof ClientImpl.InternalClientStatusListener) {
                continue;
            }
            try {
                csl.uncaughtException(cb, r, t);
                handledByClient = true;
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        if (!handledByClient) {
            t.printStackTrace();
        }
    }

    synchronized void addClientStatusListener(ClientStatusListenerExt listener) {
        if (!m_listeners.contains(listener)) {
            m_listeners.add(listener);
        }
    }

    synchronized boolean removeClientStatusListener(ClientStatusListenerExt listener) {
        return m_listeners.remove(listener);
    }

    ClientStatsContext createStatsContext() {
        return new ClientStatsContext(this, getStatsSnapshot(), getIOStatsSnapshot(), getAffinityStatsSnapshot());
    }

    Map<Long, Map<String, ClientStats>> getStatsSnapshot() {
        Map<Long, Map<String, ClientStats>> retval = new TreeMap<>();
        for (NodeConnection conn : m_connections) {
            Map<String, ClientStats> connMap = new TreeMap<>();
            for (Entry<String, ClientStats> e : conn.m_stats.entrySet()) {
                connMap.put(e.getKey(), (ClientStats) e.getValue().clone());
            }
            retval.put(conn.connectionId(), connMap);
        }
        return retval;
    }
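    /*
     * Usage sketch (assumed caller, not part of this file): these snapshots back
     * ClientStatsContext, so application code typically does something like
     *     ClientStatsContext ctx = client.createStatsContext();
     *     // ... run workload ...
     *     ClientStats stats = ctx.fetchAndResetBaseline().getStats();
     * The clone() calls keep a snapshot stable while connections keep updating.
     */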
    Map<Long, ClientIOStats> getIOStatsSnapshot() {
        Map<Long, ClientIOStats> retval = new TreeMap<>();

        Map<Long, Pair<String, long[]>> ioStats;
        try {
            ioStats = m_network.getIOStats(false, ImmutableList.<IOStatsIntf>of());
        } catch (Exception e) {
            return null;
        }

        for (NodeConnection conn : m_connections) {
            Pair<String, long[]> perConnIOStats = ioStats.get(conn.connectionId());
            if (perConnIOStats == null) {
                continue;
            }

            long read = perConnIOStats.getSecond()[0];
            long write = perConnIOStats.getSecond()[2];
            ClientIOStats cios = new ClientIOStats(conn.connectionId(), read, write);
            retval.put(conn.connectionId(), cios);
        }

        return retval;
    }

    Map<Integer, ClientAffinityStats> getAffinityStatsSnapshot() {
        Map<Integer, ClientAffinityStats> retval = new HashMap<>();
        // these get modified under this lock in queue()
        synchronized (this) {
            for (Entry<Integer, ClientAffinityStats> e : m_clientAffinityStats.entrySet()) {
                retval.put(e.getKey(), (ClientAffinityStats)e.getValue().clone());
            }
        }
        return retval;
    }

    public synchronized Object[] getInstanceId() {
        return m_clusterInstanceId;
    }

    /**
     * Not exposed to users for the moment.
     */
    public synchronized void resetInstanceId() {
        m_clusterInstanceId = null;
    }

    public String getBuildString() {
        return m_buildString;
    }

    public List<Long> getThreadIds() {
        return m_network.getThreadIds();
    }

    public List<InetSocketAddress> getConnectedHostList() {
        ArrayList<InetSocketAddress> addressList = new ArrayList<>();
        for (NodeConnection conn : m_connections) {
            addressList.add(conn.getSocketAddress());
        }
        return Collections.unmodifiableList(addressList);
    }

    public Map<String, Integer> getConnectedHostIPAndPort() {
        Map<String, Integer> connectedHostIPAndPortMap = Maps.newHashMap();
        for (NodeConnection conn : m_connections) {
            connectedHostIPAndPortMap.put(conn.getSocketAddress().getAddress().getHostAddress(),
                    (conn.getSocketAddress().getPort()));
        }
        return Collections.unmodifiableMap(connectedHostIPAndPortMap);
    }

    private void updateAffinityTopology(VoltTable tables[]) {
        // The first table contains the description of partition ids and master/slave relationships
        VoltTable vt = tables[0];

        // In future let TOPO return cooked bytes when cooked, and we use the correct recipe
        boolean cooked = false;
        if (tables.length == 1) {
            // Just in case the new client connects to an old version of Volt that only returns 1 topology table
            // We're going to get the MPI back in this table, so subtract it out from the number of partitions.
            int numPartitions = vt.getRowCount() - 1;
            m_hashinator = new HashinatorLite(numPartitions); // legacy only
        } else {
            // The second table contains the hash function
            boolean advanced = tables[1].advanceRow();
            if (!advanced) {
                System.err.println("Topology description received from Volt was incomplete; " +
                        "performance will be lower because transactions can't be routed at this client");
                return;
            }
            m_hashinator = new HashinatorLite(
                    HashinatorLiteType.valueOf(tables[1].getString("HASHTYPE")),
                    tables[1].getVarbinary("HASHCONFIG"),
                    cooked);
        }
        m_partitionMasters.clear();
        m_partitionReplicas.clear();

        // The MPI's partition ID is 16383 (MpInitiator.MP_INIT_PID), so we shouldn't inadvertently
        // hash to it. Go ahead and include it in the maps; we can use it at some point to
        // route MP transactions directly to the MPI node.
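        // The "Sites" and "Leader" columns parsed below are assumed to carry
        // comma-separated host:site pairs, e.g. Sites = "0:1,1:1,2:1" and
        // Leader = "1:1"; only the host id before the colon matters here.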
        Set<Integer> unconnected = new HashSet<Integer>();
        while (vt.advanceRow()) {
            Integer partition = (int)vt.getLong("Partition");

            ArrayList<NodeConnection> connections = new ArrayList<>();
            for (String site : vt.getString("Sites").split(",")) {
                site = site.trim();
                Integer hostId = Integer.valueOf(site.split(":")[0]);
                if (m_hostIdToConnection.containsKey(hostId)) {
                    connections.add(m_hostIdToConnection.get(hostId));
                } else {
                    unconnected.add(hostId);
                }
            }
            m_partitionReplicas.put(partition, connections.toArray(new NodeConnection[0]));

            Integer leaderHostId = Integer.valueOf(vt.getString("Leader").split(":")[0]);
            if (m_hostIdToConnection.containsKey(leaderHostId)) {
                m_partitionMasters.put(partition, m_hostIdToConnection.get(leaderHostId));
            }
        }
        if (m_topologyChangeAware) {
            m_unconnectedHosts.set(ImmutableSet.copyOf(unconnected));
        }
        refreshPartitionKeys(true);
    }

    private void updateProcedurePartitioning(VoltTable vt) {
        Map<String, Procedure> procs = Maps.newHashMap();
        while (vt.advanceRow()) {
            try {
                // Data is embedded in a JSON object in the remarks column
                String jsString = vt.getString(6);
                String procedureName = vt.getString(2);
                JSONObject jsObj = new JSONObject(jsString);
                boolean readOnly = jsObj.getBoolean(Constants.JSON_READ_ONLY);
                if (jsObj.getBoolean(Constants.JSON_SINGLE_PARTITION)) {
                    int partitionParameter = jsObj.getInt(Constants.JSON_PARTITION_PARAMETER);
                    int partitionParameterType = jsObj.getInt(Constants.JSON_PARTITION_PARAMETER_TYPE);
                    procs.put(procedureName,
                            new Procedure(false, readOnly, partitionParameter, partitionParameterType));
                } else {
                    // Multi-part procedure JSON descriptors omit the partitionParameter
                    procs.put(procedureName,
                            new Procedure(true, readOnly, Procedure.PARAMETER_NONE, Procedure.PARAMETER_NONE));
                }
            } catch (JSONException e) {
                e.printStackTrace();
            }
        }
        ImmutableSortedMap<String, Procedure> oldProcs = m_procedureInfo.get();
        m_procedureInfo.compareAndSet(oldProcs, ImmutableSortedMap.copyOf(procs));
    }

    private void updatePartitioning(VoltTable vt) {
        List<Integer> keySet = new ArrayList<Integer>();
        while (vt.advanceRow()) {
            // check for mock unit test
            if (vt.getColumnCount() == 2) {
                Integer key = (int)(vt.getLong("PARTITION_KEY"));
                keySet.add(key);
            }
        }
        m_partitionKeys.set(ImmutableSet.copyOf(keySet));
    }

    /**
     * Return whether the hashinator is initialized. This is useful only for non-standard clients.
     * This will only ever return true if client affinity is turned on.
     *
     * @return true if the hashinator is initialized
     */
    public boolean isHashinatorInitialized() {
        return (m_hashinator != null);
    }
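    /*
     * Caller-side sketch for the method below (assumed usage): a loader that
     * buckets rows by partition might do
     *     long bucket = distributer.getPartitionForParameter(
     *             VoltType.BIGINT.getValue(), rowKey);
     * and treat -1 as "hashinator not initialized yet; fall back to one bucket".
     */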
    /**
     * This is used by clients such as CSVLoader which put processing into buckets.
     *
     * @param typeValue volt type value
     * @param value the representative value
     * @return the partition for the value, or -1 if the hashinator is not initialized
     */
    public long getPartitionForParameter(byte typeValue, Object value) {
        if (m_hashinator == null) {
            return -1;
        }
        return m_hashinator.getHashedPartitionForParameter(typeValue, value);
    }

    public HashinatorLiteType getHashinatorType() {
        if (m_hashinator == null) {
            return HashinatorLiteType.LEGACY;
        }
        return m_hashinator.getConfigurationType();
    }

    private ByteBuffer serializeSPI(ProcedureInvocation pi) throws IOException {
        ByteBuffer buf = ByteBuffer.allocate(pi.getSerializedSize() + 4);
        buf.putInt(buf.capacity() - 4);
        pi.flattenToBuffer(buf);
        buf.flip();
        return buf;
    }

    long getProcedureTimeoutNanos() {
        return m_procedureCallTimeoutNanos;
    }

    ImmutableSet<Integer> getPartitionKeys() throws NoConnectionsException, IOException, ProcCallException {
        refreshPartitionKeys(false);

        if (m_partitionUpdateStatus.get().getStatus() != ClientResponse.SUCCESS) {
            throw new ProcCallException(m_partitionUpdateStatus.get(), null, null);
        }

        return m_partitionKeys.get();
    }

    /**
     * Set up the partition keys.
     * @param topologyUpdate if true, this is called from a topology update and will not block
     *        waiting for the response. Failures are recorded in m_partitionUpdateStatus and
     *        surfaced through getPartitionKeys() rather than thrown from here.
     */
    private void refreshPartitionKeys(boolean topologyUpdate) {
        long interval = System.currentTimeMillis() - m_lastPartitionKeyFetched.get();
        if (!m_useClientAffinity && interval < PARTITION_KEYS_INFO_REFRESH_FREQUENCY) {
            return;
        }

        try {
            ProcedureInvocation invocation = new ProcedureInvocation(m_sysHandle.getAndDecrement(), "@GetPartitionKeys", "INTEGER");
            CountDownLatch latch = null;
            if (!topologyUpdate) {
                latch = new CountDownLatch(1);
            }
            PartitionUpdateCallback cb = new PartitionUpdateCallback(latch);
            if (!queue(invocation, cb, true, System.nanoTime(), USE_DEFAULT_CLIENT_TIMEOUT)) {
                m_partitionUpdateStatus.set(new ClientResponseImpl(ClientResponseImpl.SERVER_UNAVAILABLE,
                        new VoltTable[0],
                        "Failed to queue the partition update query; please try again later."));
            }

            if (!topologyUpdate) {
                latch.await();
            }
            m_lastPartitionKeyFetched.set(System.currentTimeMillis());
        } catch (InterruptedException | IOException e) {
            m_partitionUpdateStatus.set(new ClientResponseImpl(ClientResponseImpl.SERVER_UNAVAILABLE,
                    new VoltTable[0],
                    "Failed to fetch partition keys from the server: " + e.getMessage()));
        }
    }

    void setTopologyChangeAware(boolean topoAware) {
        m_topologyChangeAware = topoAware;
    }

    void createConnectionsUponTopologyChange() {
        if (!m_topologyChangeAware || m_createConnectionUponTopoChangeInProgress.get()) {
            return;
        }
        m_createConnectionUponTopoChangeInProgress.set(true);
        ImmutableSet<Integer> unconnected = m_unconnectedHosts.get();
        if (unconnected != null && !unconnected.isEmpty()) {
            m_unconnectedHosts.compareAndSet(unconnected, ImmutableSet.copyOf(new HashSet<Integer>()));
            for (Integer host : unconnected) {
                if (!isHostConnected(host)) {
                    for (ClientStatusListenerExt csl : m_listeners) {
                        if (csl instanceof ClientImpl.InternalClientStatusListener) {
                            ((ClientImpl.InternalClientStatusListener)csl).createConnectionsUponTopologyChange();
                            break;
                        }
                    }
                }
            }
        }
        m_createConnectionUponTopoChangeInProgress.set(false);
    }
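    /*
     * Completion hook for the topology-aware reconnect path: presumably invoked
     * by the internal listener above once it has finished opening connections to
     * newly reported hosts, so the in-progress flag is cleared and a fresh TOPO
     * snapshot is requested over the new connection set.
     */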
    void setCreateConnectionsUponTopologyChangeComplete() throws NoConnectionsException {
        m_createConnectionUponTopoChangeInProgress.set(false);
        ProcedureInvocation spi = new ProcedureInvocation(m_sysHandle.getAndDecrement(), "@Statistics", "TOPO", 0);
        queue(spi, new TopoUpdateCallback(), true, System.nanoTime(), USE_DEFAULT_CLIENT_TIMEOUT);
    }

    boolean isHostConnected(Integer hostId) {
        return m_hostIdToConnection.containsKey(hostId);
    }
}