/*
*
* Copyright 2013 Netflix, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package com.netflix.loadbalancer;
import java.util.Date;
import java.util.Random;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import com.google.common.annotations.VisibleForTesting;
import com.netflix.config.DynamicIntProperty;
import com.netflix.config.DynamicPropertyFactory;
import com.netflix.servo.annotations.DataSourceType;
import com.netflix.servo.annotations.Monitor;
import com.netflix.stats.distribution.DataDistribution;
import com.netflix.stats.distribution.DataPublisher;
import com.netflix.stats.distribution.Distribution;
import com.netflix.util.MeasuredRate;
/**
* Capture various stats per Server(node) in the LoadBalancer
* @author stonse
*
*/
public class ServerStats {
private static final int DEFAULT_PUBLISH_INTERVAL = 60 * 1000; // = 1 minute
private static final int DEFAULT_BUFFER_SIZE = 60 * 1000; // = 1000 requests/sec for 1 minute
private final DynamicIntProperty connectionFailureThreshold;
private final DynamicIntProperty circuitTrippedTimeoutFactor;
private final DynamicIntProperty maxCircuitTrippedTimeout;
private static final DynamicIntProperty activeRequestsCountTimeout =
DynamicPropertyFactory.getInstance().getIntProperty("niws.loadbalancer.serverStats.activeRequestsCount.effectiveWindowSeconds", 60 * 10);
private static final double[] PERCENTS = makePercentValues();
private DataDistribution dataDist = new DataDistribution(1, PERCENTS); // in case
private DataPublisher publisher = null;
private final Distribution responseTimeDist = new Distribution();
int bufferSize = DEFAULT_BUFFER_SIZE;
int publishInterval = DEFAULT_PUBLISH_INTERVAL;
long failureCountSlidingWindowInterval = 1000;
private MeasuredRate serverFailureCounts = new MeasuredRate(failureCountSlidingWindowInterval);
private MeasuredRate requestCountInWindow = new MeasuredRate(300000L);
Server server;
AtomicLong totalRequests = new AtomicLong();
@VisibleForTesting
AtomicInteger successiveConnectionFailureCount = new AtomicInteger(0);
@VisibleForTesting
AtomicInteger activeRequestsCount = new AtomicInteger(0);
@VisibleForTesting
AtomicInteger openConnectionsCount = new AtomicInteger(0);
private volatile long lastConnectionFailedTimestamp;
private volatile long lastActiveRequestsCountChangeTimestamp;
private AtomicLong totalCircuitBreakerBlackOutPeriod = new AtomicLong(0);
private volatile long lastAccessedTimestamp;
private volatile long firstConnectionTimestamp = 0;
public ServerStats() {
connectionFailureThreshold = DynamicPropertyFactory.getInstance().getIntProperty(
"niws.loadbalancer.default.connectionFailureCountThreshold", 3);
circuitTrippedTimeoutFactor = DynamicPropertyFactory.getInstance().getIntProperty(
"niws.loadbalancer.default.circuitTripTimeoutFactorSeconds", 10);
maxCircuitTrippedTimeout = DynamicPropertyFactory.getInstance().getIntProperty(
"niws.loadbalancer.default.circuitTripMaxTimeoutSeconds", 30);
}
public ServerStats(LoadBalancerStats lbStats) {
this.maxCircuitTrippedTimeout = lbStats.getCircuitTripMaxTimeoutSeconds();
this.circuitTrippedTimeoutFactor = lbStats.getCircuitTrippedTimeoutFactor();
this.connectionFailureThreshold = lbStats.getConnectionFailureCountThreshold();
}
/**
* Initializes the object, starting data collection and reporting.
*/
public void initialize(Server server) {
serverFailureCounts = new MeasuredRate(failureCountSlidingWindowInterval);
requestCountInWindow = new MeasuredRate(300000L);
if (publisher == null) {
dataDist = new DataDistribution(getBufferSize(), PERCENTS);
publisher = new DataPublisher(dataDist, getPublishIntervalMillis());
publisher.start();
}
this.server = server;
}
public void close() {
if (publisher != null)
publisher.stop();
}
private int getBufferSize() {
return bufferSize;
}
private long getPublishIntervalMillis() {
return publishInterval;
}
public void setBufferSize(int bufferSize) {
this.bufferSize = bufferSize;
}
public void setPublishInterval(int publishInterval) {
this.publishInterval = publishInterval;
}
/**
* The supported percentile values.
* These correspond to the various Monitor methods defined below.
* No, this is not pretty, but that's the way it is.
*/
private static enum Percent {
TEN(10), TWENTY_FIVE(25), FIFTY(50), SEVENTY_FIVE(75), NINETY(90),
NINETY_FIVE(95), NINETY_EIGHT(98), NINETY_NINE(99), NINETY_NINE_POINT_FIVE(99.5);
private double val;
Percent(double val) {
this.val = val;
}
public double getValue() {
return val;
}
}
private static double[] makePercentValues() {
Percent[] percents = Percent.values();
double[] p = new double[percents.length];
for (int i = 0; i < percents.length; i++) {
p[i] = percents[i].getValue();
}
return p;
}
public long getFailureCountSlidingWindowInterval() {
return failureCountSlidingWindowInterval;
}
public void setFailureCountSlidingWindowInterval(
long failureCountSlidingWindowInterval) {
this.failureCountSlidingWindowInterval = failureCountSlidingWindowInterval;
}
// run time methods
/**
* Increment the count of failures for this Server
*
*/
public void addToFailureCount(){
serverFailureCounts.increment();
}
/**
* Returns the count of failures in the current window
*
*/
public long getFailureCount(){
return serverFailureCounts.getCurrentCount();
}
/**
* Call this method to note the response time after every request
* @param msecs
*/
public void noteResponseTime(double msecs){
dataDist.noteValue(msecs);
responseTimeDist.noteValue(msecs);
}
public void incrementNumRequests(){
totalRequests.incrementAndGet();
}
public void incrementActiveRequestsCount() {
activeRequestsCount.incrementAndGet();
requestCountInWindow.increment();
long currentTime = System.currentTimeMillis();
lastActiveRequestsCountChangeTimestamp = currentTime;
lastAccessedTimestamp = currentTime;
if (firstConnectionTimestamp == 0) {
firstConnectionTimestamp = currentTime;
}
}
public void incrementOpenConnectionsCount() {
openConnectionsCount.incrementAndGet();
}
public void decrementActiveRequestsCount() {
if (activeRequestsCount.decrementAndGet() < 0) {
activeRequestsCount.set(0);
}
lastActiveRequestsCountChangeTimestamp = System.currentTimeMillis();
}
public void decrementOpenConnectionsCount() {
if (openConnectionsCount.decrementAndGet() < 0) {
openConnectionsCount.set(0);
}
}
public int getActiveRequestsCount() {
return getActiveRequestsCount(System.currentTimeMillis());
}
public int getActiveRequestsCount(long currentTime) {
int count = activeRequestsCount.get();
if (count == 0) {
return 0;
} else if (currentTime - lastActiveRequestsCountChangeTimestamp > activeRequestsCountTimeout.get() * 1000 || count < 0) {
activeRequestsCount.set(0);
return 0;
} else {
return count;
}
}
public int getOpenConnectionsCount() {
return openConnectionsCount.get();
}
public long getMeasuredRequestsCount() {
return requestCountInWindow.getCount();
}
@Monitor(name="ActiveRequestsCount", type = DataSourceType.GAUGE)
public int getMonitoredActiveRequestsCount() {
return activeRequestsCount.get();
}
@Monitor(name="CircuitBreakerTripped", type = DataSourceType.INFORMATIONAL)
public boolean isCircuitBreakerTripped() {
return isCircuitBreakerTripped(System.currentTimeMillis());
}
public boolean isCircuitBreakerTripped(long currentTime) {
long circuitBreakerTimeout = getCircuitBreakerTimeout();
if (circuitBreakerTimeout <= 0) {
return false;
}
return circuitBreakerTimeout > currentTime;
}
private long getCircuitBreakerTimeout() {
long blackOutPeriod = getCircuitBreakerBlackoutPeriod();
if (blackOutPeriod <= 0) {
return 0;
}
return lastConnectionFailedTimestamp + blackOutPeriod;
}
private long getCircuitBreakerBlackoutPeriod() {
int failureCount = successiveConnectionFailureCount.get();
int threshold = connectionFailureThreshold.get();
if (failureCount < threshold) {
return 0;
}
int diff = (failureCount - threshold) > 16 ? 16 : (failureCount - threshold);
int blackOutSeconds = (1 << diff) * circuitTrippedTimeoutFactor.get();
if (blackOutSeconds > maxCircuitTrippedTimeout.get()) {
blackOutSeconds = maxCircuitTrippedTimeout.get();
}
return blackOutSeconds * 1000L;
}
public void incrementSuccessiveConnectionFailureCount() {
lastConnectionFailedTimestamp = System.currentTimeMillis();
successiveConnectionFailureCount.incrementAndGet();
totalCircuitBreakerBlackOutPeriod.addAndGet(getCircuitBreakerBlackoutPeriod());
}
public void clearSuccessiveConnectionFailureCount() {
successiveConnectionFailureCount.set(0);
}
@Monitor(name="SuccessiveConnectionFailureCount", type = DataSourceType.GAUGE)
public int getSuccessiveConnectionFailureCount() {
return successiveConnectionFailureCount.get();
}
/*
* Response total times
*/
/**
* Gets the average total amount of time to handle a request, in milliseconds.
*/
@Monitor(name = "OverallResponseTimeMillisAvg", type = DataSourceType.INFORMATIONAL,
description = "Average total time for a request, in milliseconds")
public double getResponseTimeAvg() {
return responseTimeDist.getMean();
}
/**
* Gets the maximum amount of time spent handling a request, in milliseconds.
*/
@Monitor(name = "OverallResponseTimeMillisMax", type = DataSourceType.INFORMATIONAL,
description = "Max total time for a request, in milliseconds")
public double getResponseTimeMax() {
return responseTimeDist.getMaximum();
}
/**
* Gets the minimum amount of time spent handling a request, in milliseconds.
*/
@Monitor(name = "OverallResponseTimeMillisMin", type = DataSourceType.INFORMATIONAL,
description = "Min total time for a request, in milliseconds")
public double getResponseTimeMin() {
return responseTimeDist.getMinimum();
}
/**
* Gets the standard deviation in the total amount of time spent handling a request, in milliseconds.
*/
@Monitor(name = "OverallResponseTimeMillisStdDev", type = DataSourceType.INFORMATIONAL,
description = "Standard Deviation in total time to handle a request, in milliseconds")
public double getResponseTimeStdDev() {
return responseTimeDist.getStdDev();
}
/*
* QOS percentile performance data for most recent period
*/
/**
* Gets the number of samples used to compute the various response-time percentiles.
*/
@Monitor(name = "ResponseTimePercentileNumValues", type = DataSourceType.GAUGE,
description = "The number of data points used to compute the currently reported percentile values")
public int getResponseTimePercentileNumValues() {
return dataDist.getSampleSize();
}
/**
* Gets the time when the varios percentile data was last updated.
*/
@Monitor(name = "ResponseTimePercentileWhen", type = DataSourceType.INFORMATIONAL,
description = "The time the percentile values were computed")
public String getResponseTimePercentileTime() {
return dataDist.getTimestamp();
}
/**
* Gets the time when the varios percentile data was last updated,
* in milliseconds since the epoch.
*/
@Monitor(name = "ResponseTimePercentileWhenMillis", type = DataSourceType.COUNTER,
description = "The time the percentile values were computed in milliseconds since the epoch")
public long getResponseTimePercentileTimeMillis() {
return dataDist.getTimestampMillis();
}
/**
* Gets the average total amount of time to handle a request
* in the recent time-slice, in milliseconds.
*/
@Monitor(name = "ResponseTimeMillisAvg", type = DataSourceType.GAUGE,
description = "Average total time for a request in the recent time slice, in milliseconds")
public double getResponseTimeAvgRecent() {
return dataDist.getMean();
}
/**
* Gets the 10-th percentile in the total amount of time spent handling a request, in milliseconds.
*/
@Monitor(name = "ResponseTimeMillis10Percentile", type = DataSourceType.INFORMATIONAL,
description = "10th percentile in total time to handle a request, in milliseconds")
public double getResponseTime10thPercentile() {
return getResponseTimePercentile(Percent.TEN);
}
/**
* Gets the 25-th percentile in the total amount of time spent handling a request, in milliseconds.
*/
@Monitor(name = "ResponseTimeMillis25Percentile", type = DataSourceType.INFORMATIONAL,
description = "25th percentile in total time to handle a request, in milliseconds")
public double getResponseTime25thPercentile() {
return getResponseTimePercentile(Percent.TWENTY_FIVE);
}
/**
* Gets the 50-th percentile in the total amount of time spent handling a request, in milliseconds.
*/
@Monitor(name = "ResponseTimeMillis50Percentile", type = DataSourceType.INFORMATIONAL,
description = "50th percentile in total time to handle a request, in milliseconds")
public double getResponseTime50thPercentile() {
return getResponseTimePercentile(Percent.FIFTY);
}
/**
* Gets the 75-th percentile in the total amount of time spent handling a request, in milliseconds.
*/
@Monitor(name = "ResponseTimeMillis75Percentile", type = DataSourceType.INFORMATIONAL,
description = "75th percentile in total time to handle a request, in milliseconds")
public double getResponseTime75thPercentile() {
return getResponseTimePercentile(Percent.SEVENTY_FIVE);
}
/**
* Gets the 90-th percentile in the total amount of time spent handling a request, in milliseconds.
*/
@Monitor(name = "ResponseTimeMillis90Percentile", type = DataSourceType.INFORMATIONAL,
description = "90th percentile in total time to handle a request, in milliseconds")
public double getResponseTime90thPercentile() {
return getResponseTimePercentile(Percent.NINETY);
}
/**
* Gets the 95-th percentile in the total amount of time spent handling a request, in milliseconds.
*/
@Monitor(name = "ResponseTimeMillis95Percentile", type = DataSourceType.GAUGE,
description = "95th percentile in total time to handle a request, in milliseconds")
public double getResponseTime95thPercentile() {
return getResponseTimePercentile(Percent.NINETY_FIVE);
}
/**
* Gets the 98-th percentile in the total amount of time spent handling a request, in milliseconds.
*/
@Monitor(name = "ResponseTimeMillis98Percentile", type = DataSourceType.INFORMATIONAL,
description = "98th percentile in total time to handle a request, in milliseconds")
public double getResponseTime98thPercentile() {
return getResponseTimePercentile(Percent.NINETY_EIGHT);
}
/**
* Gets the 99-th percentile in the total amount of time spent handling a request, in milliseconds.
*/
@Monitor(name = "ResponseTimeMillis99Percentile", type = DataSourceType.GAUGE,
description = "99th percentile in total time to handle a request, in milliseconds")
public double getResponseTime99thPercentile() {
return getResponseTimePercentile(Percent.NINETY_NINE);
}
/**
* Gets the 99.5-th percentile in the total amount of time spent handling a request, in milliseconds.
*/
@Monitor(name = "ResponseTimeMillis99_5Percentile", type = DataSourceType.GAUGE,
description = "99.5th percentile in total time to handle a request, in milliseconds")
public double getResponseTime99point5thPercentile() {
return getResponseTimePercentile(Percent.NINETY_NINE_POINT_FIVE);
}
public long getTotalRequestsCount() {
return totalRequests.get();
}
private double getResponseTimePercentile(Percent p) {
return dataDist.getPercentiles()[p.ordinal()];
}
public String toString(){
StringBuilder sb = new StringBuilder();
sb.append("[Server:" + server + ";");
sb.append("\tZone:" + server.getZone() + ";");
sb.append("\tTotal Requests:" + totalRequests + ";");
sb.append("\tSuccessive connection failure:" + getSuccessiveConnectionFailureCount() + ";");
if (isCircuitBreakerTripped()) {
sb.append("\tBlackout until: " + new Date(getCircuitBreakerTimeout()) + ";");
}
sb.append("\tTotal blackout seconds:" + totalCircuitBreakerBlackOutPeriod.get() / 1000 + ";");
sb.append("\tLast connection made:" + new Date(lastAccessedTimestamp) + ";");
if (lastConnectionFailedTimestamp > 0) {
sb.append("\tLast connection failure: " + new Date(lastConnectionFailedTimestamp) + ";");
}
sb.append("\tFirst connection made: " + new Date(firstConnectionTimestamp) + ";");
sb.append("\tActive Connections:" + getMonitoredActiveRequestsCount() + ";");
sb.append("\ttotal failure count in last (" + failureCountSlidingWindowInterval + ") msecs:" + getFailureCount() + ";");
sb.append("\taverage resp time:" + getResponseTimeAvg() + ";");
sb.append("\t90 percentile resp time:" + getResponseTime90thPercentile() + ";");
sb.append("\t95 percentile resp time:" + getResponseTime95thPercentile() + ";");
sb.append("\tmin resp time:" + getResponseTimeMin() + ";");
sb.append("\tmax resp time:" + getResponseTimeMax() + ";");
sb.append("\tstddev resp time:" + getResponseTimeStdDev());
sb.append("]\n");
return sb.toString();
}
public static void main(String[] args){
ServerStats ss = new ServerStats();
ss.setBufferSize(1000);
ss.setPublishInterval(1000);
ss.initialize(new Server("stonse", 80));
Random r = new Random(1459834);
for (int i=0; i < 99; i++){
double rl = r.nextDouble() * 25.2;
ss.noteResponseTime(rl);
ss.incrementNumRequests();
try {
Thread.sleep(100);
System.out.println("ServerStats:avg:" + ss.getResponseTimeAvg());
System.out.println("ServerStats:90 percentile:" + ss.getResponseTime90thPercentile());
System.out.println("ServerStats:90 percentile:" + ss.getResponseTimePercentileNumValues());
} catch (InterruptedException e) {
}
}
System.out.println("done ---");
ss.publisher.stop();
System.out.println("ServerStats:" + ss);
}
}