/*
* Copyright 2013 Cloudera Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.client.solrj.retry;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import java.util.regex.Pattern;
import org.apache.commons.lang.exception.ExceptionUtils;
import org.apache.solr.client.solrj.SolrRequest;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.beans.DocumentObjectBinder;
import org.apache.solr.client.solrj.request.QueryRequest;
import org.apache.solr.common.util.NamedList;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.annotations.VisibleForTesting;
/**
* A wrapper around an underlying solrj {@link SolrClient} object; On exceptions, the wrapper
* retries solrj requests according to the given configurable {@link RetryPolicyFactory}.
*
* This is useful in case solr servers are temporarily overloaded, or if there are transient network
* issues, etc.
*
* For example, this class can be configured with a {@link DefaultRetryPolicyFactory} in combination
* with the {@link FlexibleBoundedExponentialBackoffRetry}. Metrics events are forwarded to a
* {@link MetricsFacade} object.
*/
public class RetryingSolrServer extends SolrClient {
private final SolrClient solrServer;
private final RetryPolicyFactory retryPolicyFactory;
private final MetricsFacade metrics;
private final CountDownLatch isShuttingDown = new CountDownLatch(1);
private final RetrySleeper sleeper = new RetrySleeper() {
@Override
public void sleepFor(long time, TimeUnit unit) throws InterruptedException {
isShuttingDown.await(time, unit);
}
};
private final AtomicLong totalRetryCount = new AtomicLong(0);
private final AtomicLong totalRequestCount = new AtomicLong(0);
private static final String ROOT_PREFIX = RetryingSolrServer.class.getSimpleName() + ".";
private static final int MAX_STRING_LENGTH = Integer.parseInt(System.getProperty(
RetryingSolrServer.class.getSimpleName() +".maxStringLength", "300"));
private static final boolean REDACT_QUERY_REQUEST_PARAMS = "true".equals(System.getProperty(
RetryingSolrServer.class.getSimpleName() +".redactQueryRequestParams", "true"));
private static final boolean ENABLE_LOGGING = "true".equals(System.getProperty(
RetryingSolrServer.class.getSimpleName() +".enableLogging", "true"));
private static final Pattern WHITESPACE_EXCEPT_SPACE_PATTERN = Pattern.compile(
"[\\t\\n\\x0B\\f\\r]");
private static final Pattern TWO_OR_MORE_SPACES_PATTERN = Pattern.compile(
" +"); // two or more spaces
private static final Pattern EXCEPTION_HOST_PATTERN = Pattern.compile(
"(UnknownHostException: )([0-9A-Za-z][0-9A-Za-z-]{0,62})((?:\\.[0-9A-Za-z][0-9A-Za-z-]{0,62}){2,}?)");
// Note: %3A%2F%2F is an encoding of :// used in some Solr exception messages
private static final Pattern URL_SCHEME_AND_AUTHORITY_PATTERN = Pattern.compile(
"([A-Za-z][0-9A-Za-z+-.]{0,62}(?:://|%3A%2F%2F))([0-9A-Za-z][0-9A-Za-z-]{0,62})((?:\\.[0-9A-Za-z][0-9A-Za-z-]{0,62})*)");
private static final Pattern SHARD_AND_REPLICA_PATTERN = Pattern.compile(
"(_(?:shard|SHARD))[0-9]+(_(?:replica|REPLICA))[0-9]+");
private static final Pattern COLLECTION_AND_SLICE_PATTERN = Pattern.compile(
"(collection: \\S+ slice: shard)\\d+");
private static final Pattern NOW_TIMESTAMP_PATTERN = Pattern.compile(
"(NOW=)\\d+"); // part of Solr query params
private static final Logger LOG = LoggerFactory.getLogger(RetryingSolrServer.class);
/**
* Constructs an instance with the given parameters.
*
* Caution: retryPolicyFactory and metrics params must be thread-safe if multiple threads are
* concurrently calling the same {@link RetryingSolrServer}.
*/
public RetryingSolrServer(SolrClient solrServer,
RetryPolicyFactory retryPolicyFactory,
MetricsFacade metrics) {
if (solrServer == null) {
throw new IllegalArgumentException("solrServer param must not be null");
}
this.solrServer = solrServer;
if (retryPolicyFactory == null) {
throw new IllegalArgumentException("retryPolicyFactory param must not be null");
}
this.retryPolicyFactory = retryPolicyFactory;
if (metrics == null) {
metrics = new NullMetricsFacade();
}
this.metrics = metrics;
}
/** Returns the backing solr proxy */
public final SolrClient getUnderlyingSolrServer() {
return solrServer;
}
protected final MetricsFacade getMetrics() {
return metrics;
}
/**
* Returns the number of nano seconds to wait until starting execution of this request. This
* default implementation returns zero to indicate that the request shall be started immediately.
*
* Override this method for custom behaviour, e.g. using Guava's RateLimiter.reserve(1).
*/
protected long limitRequestRate(SolrRequest request) {
return 0;
}
@Override
public NamedList<Object> request(final SolrRequest request, String collection) throws SolrServerException, IOException {
int retryCount = 0;
RetryPolicy retryPolicy = null;
totalRequestCount.incrementAndGet();
getMetrics().markMeter(ROOT_PREFIX + "requests", 1);
Map<String, MutableLong> exceptionRootCauseCounters = new HashMap<String, MutableLong>();
final long startTime = System.nanoTime();
while (true) {
final long nanosToWaitForRateLimiter = limitRequestRate(request);
try {
isShuttingDown.await(nanosToWaitForRateLimiter, TimeUnit.NANOSECONDS);
} catch (InterruptedException e) {
; // ignore
}
final long lastStartTime = System.nanoTime();
NamedList response;
try {
response = solrServer.request(request, collection);
} catch (Exception exception) {
long requestDuration = System.nanoTime() - lastStartTime;
String exceptionTopLevelMsg = limitStringLength(getExceptionKey(exception));
String exceptionRootCauseMsg = limitStringLength(getExceptionKey(getRootCause(exception)));
String requestKey = limitStringLength(getRequestKey(request));
handleException(request, exceptionTopLevelMsg, exceptionRootCauseMsg, requestKey, requestDuration);
String reason = "n/a";
boolean retry = !isShuttingDown();
if (!retry) {
reason = "Solrj client is shutting down";
} else {
retryPolicy = retryPolicyFactory.getRetryPolicy(exception, request, solrServer, retryPolicy);
retry = retryPolicy != null;
if (!retry) {
reason = "Exception is non-retryable per RetryPolicyFactory {" + retryPolicyFactory + "}";
} else {
if (ENABLE_LOGGING) {
LOG.warn("Retryable exception received: ", exception);
}
retry = retryPolicy.allowRetry(
retryCount,
System.nanoTime() - startTime,
sleeper);
if (!retry) {
reason = "Request must not be retried " + (retryCount > 0 ? "anymore " : "")
+ "per RetryPolicy {" + retryPolicy + "}";
} else {
retry = !isShuttingDown(); // recheck
if (!retry) {
reason = "Solrj client is shutting down(2)";
}
}
}
}
if (!retry) {
// rethrow exception with added details to help users diagnose the situation
long secs = TimeUnit.NANOSECONDS.toSeconds(System.nanoTime() - startTime);
throw new RetriesExhaustedException(
"Performed " + totalRetryCount + " retries across " + totalRequestCount
+ " client requests. Gave up because the last " + retryCount + " retries across " + secs
+ " seconds for the current request failed with this reason: '" + reason + "' and these"
+ " root causes: ",
exceptionRootCauseCounters,
exception);
}
if (ENABLE_LOGGING) {
LOG.info("Retrying request: {}", requestKey);
}
assert exception != null;
retryCount++;
totalRetryCount.incrementAndGet();
incCounter(exceptionRootCauseCounters, exceptionRootCauseMsg);
metrics.markMeter(ROOT_PREFIX + "retries", 1);
continue; // retry request by continuing while(true) retry loop
}
// solr request succeeded without throwing an exception
long requestDuration = System.nanoTime() - lastStartTime;
handleSuccess(request, response, retryCount, startTime, requestDuration);
return response;
}
}
private void handleSuccess(SolrRequest request, NamedList response, int retryCount, long startTime,
long requestDuration) {
String successPrefix = ROOT_PREFIX + "success.";
if (retryCount > 0) {
String retriedRequestsPrefix = successPrefix + "retry.";
long totalRequestDuration = System.nanoTime() - startTime;
metrics.updateHistogram(retriedRequestsPrefix + "count", retryCount);
metrics.updateTimer(retriedRequestsPrefix + "time", totalRequestDuration, TimeUnit.NANOSECONDS);
// metrics.updateHistogram(retriedRequestsPrefix + "count.req." + requestKey, retryCount);
// metrics.updateHistogram(retriedRequestsPrefix + "count.top." + firstExceptionTopLevelMsg, retryCount);
// metrics.updateHistogram(retriedRequestsPrefix + "count.root." + firstExceptionRootCauseMsg, retryCount);
// metrics.updateTimer(retriedRequestsPrefix + "time.req." + requestKey, totalRequestDuration, TimeUnit.NANOSECONDS);
// metrics.updateTimer(retriedRequestsPrefix + "time.top." + firstExceptionTopLevelMsg, totalRequestDuration, TimeUnit.NANOSECONDS);
// metrics.updateTimer(retriedRequestsPrefix + "time.root." + firstExceptionRootCauseMsg, totalRequestDuration, TimeUnit.NANOSECONDS);
}
metrics.updateHistogram(successPrefix + "anycount", retryCount);
metrics.updateTimer(ROOT_PREFIX + "time", requestDuration, TimeUnit.NANOSECONDS);
}
private void handleException(SolrRequest request, String exceptionTopLevelMsg, String exceptionRootCauseMsg,
String requestKey, long requestDuration) {
String timePrefix = "time.";
metrics.updateTimer(ROOT_PREFIX + "time", requestDuration, TimeUnit.NANOSECONDS);
String failedRequestsPrefix = ROOT_PREFIX + "fail.";
metrics.updateTimer(failedRequestsPrefix + "time", requestDuration, TimeUnit.NANOSECONDS);
metrics.updateTimer(failedRequestsPrefix + timePrefix + "req." + requestKey, requestDuration, TimeUnit.NANOSECONDS);
metrics.updateTimer(failedRequestsPrefix + timePrefix + "xreq.top." + requestKey + "." + exceptionTopLevelMsg, requestDuration, TimeUnit.NANOSECONDS);
metrics.updateTimer(failedRequestsPrefix + timePrefix + "xreq.root." + requestKey + "." + exceptionRootCauseMsg, requestDuration, TimeUnit.NANOSECONDS);
metrics.updateTimer(failedRequestsPrefix + timePrefix + "top." + exceptionTopLevelMsg, requestDuration, TimeUnit.NANOSECONDS);
metrics.updateTimer(failedRequestsPrefix + timePrefix + "root." + exceptionRootCauseMsg, requestDuration, TimeUnit.NANOSECONDS);
}
protected String getRequestKey(SolrRequest request) {
String str = request.getClass().getSimpleName() + ".";
return REDACT_QUERY_REQUEST_PARAMS && request instanceof QueryRequest ?
str + "<redacted>" :
str + normalizeRequestKey(String.valueOf(request.getParams()));
}
@VisibleForTesting
final String normalizeRequestKey(String str) {
str = NOW_TIMESTAMP_PATTERN.matcher(str).replaceAll("$1<redacted>");
str = SHARD_AND_REPLICA_PATTERN.matcher(str).replaceAll("$1<redacted>$2<redacted>");
str = normalizeWhitespace(str);
return str;
}
/**
* Make metric names a bit more human readable.
* Examples:
* "foo bar" -> "foo bar"
* "foo\nbar" -> "foo bar"
* "foo\n\tbar" -> "foo bar"
*/
private String normalizeWhitespace(String str) {
str = WHITESPACE_EXCEPT_SPACE_PATTERN.matcher(str).replaceAll(" ");
str = TWO_OR_MORE_SPACES_PATTERN.matcher(str).replaceAll(" ");
return str;
}
protected String getExceptionKey(Throwable exception) {
return normalizeExceptionKey(exception.toString());
}
/**
* Coalesce exception messages containing HTTP REST calls to all server shards into a single bucket.
* This greatly reduces the amount of metric spam.
* Example: https://vb0706.foo.bar.com:8983 --> https://<redacted>.foo.bar.com:8983
* Example: UnknownHostException: vb0706.foo.bar.com -> UnknownHostException: <redacted>.foo.bar.com
* Example: insertAndQueryTweets_shard30_replica91 --> insertAndQueryTweets_shard<redacted>_replica<redacted>
* Example: collection: collection1 slice: shard130 --> collection: collection1 slice: shard<redacted>
* Example: &NOW=1448689151133 -> &NOW=<redacted>
*/
private String normalizeExceptionKey(String str) {
str = URL_SCHEME_AND_AUTHORITY_PATTERN.matcher(str).replaceAll("$1<redacted>$3");
str = SHARD_AND_REPLICA_PATTERN.matcher(str).replaceAll("$1<redacted>$2<redacted>");
str = EXCEPTION_HOST_PATTERN.matcher(str).replaceAll("$1<redacted>$3");
str = COLLECTION_AND_SLICE_PATTERN.matcher(str).replaceAll("$1<redacted>");
str = NOW_TIMESTAMP_PATTERN.matcher(str).replaceAll("$1<redacted>");
str = normalizeWhitespace(str);
str = omitPackageName(str, "java.io.");
str = omitPackageName(str, "java.net.");
str = omitPackageName(str, "org.apache.solr.client.solrj.impl.");
//str = omitPackageName(str, "org.apache.http.client.");
return str;
}
private String omitPackageName(String str, String prefix) {
return str.startsWith(prefix) ? str.substring(prefix.length()) : str;
}
/** prevents gigantic messages */
private String limitStringLength(String str) {
if (str.length() > MAX_STRING_LENGTH) {
str = str.substring(0, MAX_STRING_LENGTH) + " ...";
}
return str;
}
private void incCounter(Map<String, MutableLong> counters, String key) {
MutableLong counter = counters.get(key);
if (counter == null) {
counter = new MutableLong(0);
counters.put(key, counter);
}
counter.inc(1);
}
private Throwable getRootCause(Throwable throwable) {
Throwable rootCause = ExceptionUtils.getRootCause(throwable);
if (rootCause != null) {
return rootCause;
} else {
return throwable;
}
}
private boolean isShuttingDown() {
try {
return isShuttingDown.await(0, TimeUnit.NANOSECONDS);
} catch (InterruptedException e) {
return true;
}
}
@Override
public DocumentObjectBinder getBinder() {
return solrServer.getBinder();
}
@Override
public void close() {
isShuttingDown.countDown();
try {
solrServer.close();
} catch (IOException e) {
throw new RuntimeException(e);
}
}
///////////////////////////////////////////////////////////////////////////////
// Nested classes:
///////////////////////////////////////////////////////////////////////////////
/**
* A {@link MetricsFacade} implementation that ignores all metrics events - does nothing.
*/
private static final class NullMetricsFacade implements MetricsFacade {
@Override
public void markMeter(String name, long increment) {}
@Override
public void updateHistogram(String name, long value) {}
@Override
public void updateTimer(String name, long duration, TimeUnit unit) {}
};
}