AdmissionControlGroup.java example

Explorer
voltdb-master
/* This file is part of VoltDB.
 * Copyright (C) 2008-2017 VoltDB Inc.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with VoltDB.  If not, see <http://www.gnu.org/licenses/>.
 */
package org.voltdb;

import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;

import org.HdrHistogram_voltpatches.AbstractHistogram;
import org.cliffc_voltpatches.high_scale_lib.NonBlockingHashMap;
import org.voltcore.logging.VoltLogger;
import org.voltdb.dtxn.InitiatorStats.InvocationInfo;
import org.voltdb.dtxn.LatencyHistogramStats;

/**
 * Manage admission control for incoming requests by tracking the size of outstanding requests
 * as well as the number of outstanding requests in order to bound the total amount of memory
 * used to service the requests.
 *
 * Responses are also tracked by size although not by count and use the same resource pool
 * as incoming requests size wise. Count of responses is not tracked because they are flattened
 * and coalesced to byte buffers so the metadata overhead is minimal and not worth tracking separately
 *
 * Admission control only limits the amount of work each group is willing to accept into the cluster.
 * Because there is no coordination between groups it is possible for all the work to end up at one node.
 * This is guaranteed to happen if one node is slow enough that it can't keep up with the workload.
 */
public class AdmissionControlGroup implements org.voltcore.network.QueueMonitor
{
    private static final VoltLogger networkLog = new VoltLogger("NETWORK");


    /*
     * Maximum values for each group are configured when the group is constructed
     */
    final private int MAX_DESIRED_PENDING_BYTES;
    final private int LESS_THAN_MAX_DESIRED_PENDING_BYTES;
    final private int MAX_DESIRED_PENDING_TXNS;
    final private int LESS_THAN_MAX_DESIRED_PENDING_TXNS;

    private static final VoltLogger hostLog = new VoltLogger("HOST");

    private int m_pendingTxnCount = 0;
    private long m_pendingTxnBytes = 0;
    private boolean m_hadBackPressure = false;

    /*
     * If for some reason ACG logs a negative transaction count or outstanding bytes,
     * only do it once to avoid flooding. Not going to make it a fatal error, but
     * don't want it to fail silently either.
     */
    private boolean m_haveLoggedACGNegativeFailure = false;

    /*
     * The ACG is accessed lock free from the network thread that owns the group.
     * Track the thread id of callers of this class to ensure that it is always the right
     * thread using assertions.
     */
    private final long m_expectedThreadId = Thread.currentThread().getId();

    /*
     * Members of the admission control group implement this interface and are expected
     * to stop accepting new requests when after onBackpressure is invoked and are allowed
     * to start accepting requests again once offBackpressure is invoked
     */
    public interface ACGMember
    {
        public void onBackpressure();
        public void offBackpressure();
        public long connectionId();
    }



    private final HashSet<ACGMember> m_members = new HashSet<ACGMember>();

    /*
     * Reads/writes to the actual InvocationInfo are unsynchronized. There is a single writer
     * so no issues there, but the reader is unprotected.
     *
     * There is only one writer so a single stripe is fine
     */
    private final ConcurrentHashMap<Long, Map<String, org.voltdb.dtxn.InitiatorStats.InvocationInfo>> m_connectionStates =
                 new ConcurrentHashMap<Long, Map<String, org.voltdb.dtxn.InitiatorStats.InvocationInfo>>(1024, .75f, 1);

    private final AbstractHistogram m_latencyInfo = LatencyHistogramStats.constructHistogram(true);

    public AdmissionControlGroup(int maxBytes, int maxRequests)
    {
        MAX_DESIRED_PENDING_BYTES = maxBytes;
        LESS_THAN_MAX_DESIRED_PENDING_BYTES = (int)(MAX_DESIRED_PENDING_BYTES * .8);
        MAX_DESIRED_PENDING_TXNS = maxRequests;
        LESS_THAN_MAX_DESIRED_PENDING_TXNS = (int)(MAX_DESIRED_PENDING_TXNS * .8);
    }

    public static AdmissionControlGroup getDummy() {
        return new AdmissionControlGroup(Integer.MAX_VALUE, Integer.MAX_VALUE) {
            @Override
            public void addMember(ACGMember member) {}
            @Override
            public void removeMember(ACGMember member) {}
            @Override
            public void increaseBackpressure(int messageSize) {}
            @Override
            public void reduceBackpressure(int messageSize) {}
            @Override
            public boolean queue(int bytes) { return false; }
        };
    }

    public void addMember(ACGMember member)
    {
        assert(m_expectedThreadId == Thread.currentThread().getId());
        m_members.add(member);
    }

    public void removeMember(ACGMember member)
    {
        assert(m_expectedThreadId == Thread.currentThread().getId());
        m_members.remove(member);
        m_connectionStates.remove(member.connectionId());
    }

    /*
     * Invoked when accepting a new transaction. Increments pending txn count in addition
     * to tracking the number of request bytes accepted. Can invoke onBackpressure
     * on all group members if there isn't already a backpressure condition.
     */
    public void increaseBackpressure(int messageSize)
    {
        assert(m_expectedThreadId == Thread.currentThread().getId());
        if (messageSize < 1) {
            throw new IllegalArgumentException("Message size must be > 0 but was " + messageSize);
        }
        m_pendingTxnBytes += messageSize;
        m_pendingTxnCount++;

        checkAndLogInvariants();
        if (m_pendingTxnBytes > MAX_DESIRED_PENDING_BYTES || m_pendingTxnCount > MAX_DESIRED_PENDING_TXNS) {
            if (!m_hadBackPressure) {
                hostLog.debug("TXN back pressure began");
                m_hadBackPressure = true;
                for (ACGMember m : m_members) {
                    m.onBackpressure();
                }
            }
        }
    }

    /*
     * Check that various invariants are maintained. If they aren't log the error at most once,
     * and take corrective action to maintain the invariants
     */
    private void checkAndLogInvariants() {
        if (m_pendingTxnCount < 0 || m_pendingTxnBytes < 0) {
            boolean badTxnCount = m_pendingTxnCount < 0 ? true : false;
            boolean badPendingBytes = m_pendingTxnBytes < 0 ? true : false;
            if (!m_haveLoggedACGNegativeFailure) {
                m_haveLoggedACGNegativeFailure = true;
                if (badTxnCount) {
                    networkLog.error("Admission control error, negative outstanding transaction count. " +
                            "This is error is not fatal, but it does indicate that admission control " +
                            "is not correctly tracking transaction resource usage. This message will not repeat " +
                            "the next time the condition occurs to avoid log spam");
                }
                if (badPendingBytes) {
                    networkLog.error(
                            "Backpressure reports a negative outstanding transaction byte count (" +
                            m_pendingTxnBytes +
                            "). No action required.", new RuntimeException("for stack trace purposes"));
                }
            }

            /*
             * Repair both. It's possible that repairing it will trigger a repair cascade
             * effectively rendering the ACG always permissive, but it should right itself
             * once all requests associated with the ACG have left the system and the correct values are indeed 0.
             */
            m_pendingTxnCount = 0;
            m_pendingTxnBytes = 0;
        }
    }

    /*
     * Invoked when receiving a response to a transaction. Decrements pending txn count in addition
     * to tracking the number of request bytes accepted. Can invoke offBackpressure
     * on all group members if there is a backpressure condition that has ended
     */
    public void reduceBackpressure(int messageSize)
    {
        assert(m_expectedThreadId == Thread.currentThread().getId());
        if (messageSize < 1) {
            throw new IllegalArgumentException("Message size must be > 0 but was " + messageSize);
        }

        m_pendingTxnBytes -= messageSize;
        m_pendingTxnCount--;
        checkAndLogInvariants();

        if ((m_pendingTxnBytes < LESS_THAN_MAX_DESIRED_PENDING_BYTES) &&
            (m_pendingTxnCount < LESS_THAN_MAX_DESIRED_PENDING_TXNS))
        {
            if (m_hadBackPressure) {
                hostLog.debug("TXN backpressure ended");
                m_hadBackPressure = false;
                for (ACGMember m : m_members) {
                    m.offBackpressure();
                }
            }
        }
    }

    /*
     * When accepting new connections this flag is used to decide whether read selection should be enabled.
     * Read selection should not be enabled if there is currently backpressure
     */
    public boolean hasBackPressure() {
        return m_hadBackPressure;
    }

    /**
     * Used by tests.
     * @return
     */
    public long getPendingBytes() {
        return m_pendingTxnBytes;
    }

    /*
     * Invoked when queueing response bytes back to a connection. Can be invoked with positive/negative
     * values to indicate whether data is being flushed or added. The same resource pool counter is used
     * to track pending responses as is used to track outstanding requests although only the bytes are being
     * counted.
     *
     * Can signal start/stop backpressure when appropriate. The return value is not used and will be removed
     * in the future.
     */
    @Override
    public boolean queue(int bytes) {
        m_pendingTxnBytes += bytes;
        checkAndLogInvariants();

        if (m_pendingTxnBytes > MAX_DESIRED_PENDING_BYTES) {
            if (!m_hadBackPressure) {
                hostLog.debug("TXN back pressure began");
                m_hadBackPressure = true;
                for (ACGMember m : m_members) {
                    m.onBackpressure();
                }
            }
        }
        else if ((m_pendingTxnBytes < LESS_THAN_MAX_DESIRED_PENDING_BYTES) &&
                (m_pendingTxnCount < LESS_THAN_MAX_DESIRED_PENDING_TXNS)) {
            if (m_hadBackPressure) {
                hostLog.debug("TXN backpressure ended");
                m_hadBackPressure = false;
                for (ACGMember m : m_members) {
                    m.offBackpressure();
                }
            }
        }

        return false;
    }

    public void logTransactionCompleted(
            long connectionId,
            String connectionHostname,
            String procedureName,
            long deltaNanos,
            byte status) {
        boolean needToInsert = false;
        Map<String, InvocationInfo> procInfoMap = m_connectionStates.get(connectionId);
        if (procInfoMap == null) {
            procInfoMap = new NonBlockingHashMap<String, InvocationInfo>();
            needToInsert = true;
        }
        InvocationInfo info = procInfoMap.get(procedureName);
        if(info == null){
            info = new InvocationInfo(connectionHostname);
            procInfoMap.put(procedureName, info);
        }
        info.processInvocation((int)TimeUnit.NANOSECONDS.toMillis(deltaNanos), status);
        // ENG-7209 This is to not log the latency value for a snapshot restore, as this just creates
        // a large initial value in the graph which is not actually relevant to the user.
        if (!procedureName.equals("@SnapshotRestore")) {
            m_latencyInfo.recordValue(Math.max(1, Math.min(TimeUnit.NANOSECONDS.toMicros(deltaNanos), m_latencyInfo.getHighestTrackableValue())));
        }
        if (needToInsert) {
            m_connectionStates.put(connectionId, procInfoMap);
        }
    }

    public Iterator<Map.Entry<Long, Map<String, InvocationInfo>>> getInitiationStatsIterator() {
        return m_connectionStates.entrySet().iterator();
    }

    public AbstractHistogram getLatencyInfo() {
        return m_latencyInfo;
    }
}