/*
* Copyright 2015 Netflix, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.netflix.discovery.shared.transport.decorator;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.concurrent.ConcurrentSkipListSet;
import java.util.concurrent.atomic.AtomicReference;
import com.netflix.discovery.shared.resolver.ClusterResolver;
import com.netflix.discovery.shared.resolver.EurekaEndpoint;
import com.netflix.discovery.shared.transport.EurekaHttpClient;
import com.netflix.discovery.shared.transport.EurekaHttpClientFactory;
import com.netflix.discovery.shared.transport.EurekaHttpResponse;
import com.netflix.discovery.shared.transport.EurekaTransportConfig;
import com.netflix.discovery.shared.transport.TransportClientFactory;
import com.netflix.discovery.shared.transport.TransportException;
import com.netflix.discovery.shared.transport.TransportUtils;
import com.netflix.servo.annotations.DataSourceType;
import com.netflix.servo.annotations.Monitor;
import com.netflix.servo.monitor.Monitors;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import static com.netflix.discovery.EurekaClientNames.METRIC_TRANSPORT_PREFIX;
/**
* {@link RetryableEurekaHttpClient} retries failed requests on subsequent servers in the cluster.
* It maintains also simple quarantine list, so operations are not retried again on servers
* that are not reachable at the moment.
* <h3>Quarantine</h3>
* All the servers to which communication failed are put on the quarantine list. First successful execution
* clears this list, which makes those server eligible for serving future requests.
* The list is also cleared once all available servers are exhausted.
* <h3>5xx</h3>
* If 5xx status code is returned, {@link ServerStatusEvaluator} predicate evaluates if the retries should be
* retried on another server, or the response with this status code returned to the client.
*
* @author Tomasz Bak
*/
public class RetryableEurekaHttpClient extends EurekaHttpClientDecorator {
private static final Logger logger = LoggerFactory.getLogger(RetryableEurekaHttpClient.class);
public static final int DEFAULT_NUMBER_OF_RETRIES = 3;
private final String name;
private final EurekaTransportConfig transportConfig;
private final ClusterResolver clusterResolver;
private final TransportClientFactory clientFactory;
private final ServerStatusEvaluator serverStatusEvaluator;
private final int numberOfRetries;
private final AtomicReference<EurekaHttpClient> delegate = new AtomicReference<>();
private final Set<EurekaEndpoint> quarantineSet = new ConcurrentSkipListSet<>();
public RetryableEurekaHttpClient(String name,
EurekaTransportConfig transportConfig,
ClusterResolver clusterResolver,
TransportClientFactory clientFactory,
ServerStatusEvaluator serverStatusEvaluator,
int numberOfRetries) {
this.name = name;
this.transportConfig = transportConfig;
this.clusterResolver = clusterResolver;
this.clientFactory = clientFactory;
this.serverStatusEvaluator = serverStatusEvaluator;
this.numberOfRetries = numberOfRetries;
Monitors.registerObject(name, this);
}
@Override
public void shutdown() {
TransportUtils.shutdown(delegate.get());
if(Monitors.isObjectRegistered(name, this)) {
Monitors.unregisterObject(name, this);
}
}
@Override
protected <R> EurekaHttpResponse<R> execute(RequestExecutor<R> requestExecutor) {
List<EurekaEndpoint> candidateHosts = null;
int endpointIdx = 0;
for (int retry = 0; retry < numberOfRetries; retry++) {
EurekaHttpClient currentHttpClient = delegate.get();
EurekaEndpoint currentEndpoint = null;
if (currentHttpClient == null) {
if (candidateHosts == null) {
candidateHosts = getHostCandidates();
if (candidateHosts.isEmpty()) {
throw new TransportException("There is no known eureka server; cluster server list is empty");
}
}
if (endpointIdx >= candidateHosts.size()) {
throw new TransportException("Cannot execute request on any known server");
}
currentEndpoint = candidateHosts.get(endpointIdx++);
currentHttpClient = clientFactory.newClient(currentEndpoint);
}
try {
EurekaHttpResponse<R> response = requestExecutor.execute(currentHttpClient);
if (serverStatusEvaluator.accept(response.getStatusCode(), requestExecutor.getRequestType())) {
delegate.set(currentHttpClient);
if (retry > 0) {
logger.info("Request execution succeeded on retry #{}", retry);
}
return response;
}
logger.warn("Request execution failure with status code {}; retrying on another server if available", response.getStatusCode());
} catch (Exception e) {
logger.warn("Request execution failed with message: {}", e.getMessage()); // just log message as the underlying client should log the stacktrace
}
// Connection error or 5xx from the server that must be retried on another server
delegate.compareAndSet(currentHttpClient, null);
if (currentEndpoint != null) {
quarantineSet.add(currentEndpoint);
}
}
throw new TransportException("Retry limit reached; giving up on completing the request");
}
public static EurekaHttpClientFactory createFactory(final String name,
final EurekaTransportConfig transportConfig,
final ClusterResolver<EurekaEndpoint> clusterResolver,
final TransportClientFactory delegateFactory,
final ServerStatusEvaluator serverStatusEvaluator) {
return new EurekaHttpClientFactory() {
@Override
public EurekaHttpClient newClient() {
return new RetryableEurekaHttpClient(name, transportConfig, clusterResolver, delegateFactory,
serverStatusEvaluator, DEFAULT_NUMBER_OF_RETRIES);
}
@Override
public void shutdown() {
delegateFactory.shutdown();
}
};
}
private List<EurekaEndpoint> getHostCandidates() {
List<EurekaEndpoint> candidateHosts = clusterResolver.getClusterEndpoints();
quarantineSet.retainAll(candidateHosts);
// If enough hosts are bad, we have no choice but start over again
int threshold = (int) (candidateHosts.size() * transportConfig.getRetryableClientQuarantineRefreshPercentage());
if (quarantineSet.isEmpty()) {
// no-op
} else if (quarantineSet.size() >= threshold) {
logger.debug("Clearing quarantined list of size {}", quarantineSet.size());
quarantineSet.clear();
} else {
List<EurekaEndpoint> remainingHosts = new ArrayList<>(candidateHosts.size());
for (EurekaEndpoint endpoint : candidateHosts) {
if (!quarantineSet.contains(endpoint)) {
remainingHosts.add(endpoint);
}
}
candidateHosts = remainingHosts;
}
return candidateHosts;
}
@Monitor(name = METRIC_TRANSPORT_PREFIX + "quarantineSize",
description = "number of servers quarantined", type = DataSourceType.GAUGE)
public long getQuarantineSetSize() {
return quarantineSet.size();
}
}