package netflix.ocelli.retrys; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; import netflix.ocelli.functions.Metrics; import netflix.ocelli.functions.Retrys; import netflix.ocelli.functions.Stopwatches; import netflix.ocelli.util.SingleMetric; import netflix.ocelli.util.Stopwatch; import rx.Observable; import rx.Observable.OnSubscribe; import rx.Observable.Transformer; import rx.Scheduler; import rx.Subscriber; import rx.functions.Action1; import rx.functions.Func0; import rx.functions.Func1; import rx.schedulers.Schedulers; /** * Retry strategy that kicks off a second request if the first request does not * respond within an expected amount of time. The original request remains in * flight until either one responds. The strategy tracks response latencies and * feeds them into a SingleMetric that is used to determine the backup request * timeout. A common metric to use is the 90th percentile response time. * * Note that the same BackupRequestRetryStrategy instance is stateful and should * be used for all requests. Multiple BackupRequestRetryStrategy instances may be * used for different request types known to have varying response latency * distributions. * * Usage, * * {@code * <pre> * * BackupRequestRetryStrategy strategy = BackupRequestRetryStrategy.builder() * .withTimeoutMetric(Metrics.quantile(0.90)) * .withIsRetriablePolicy(somePolicyThatReturnsTrueOnRetriableErrors) * .build(); * * loadBalancer * .flatMap(operation) * .compose(strategy) * .subscribe(responseHandler) * </pre> * code} * * @author elandau * * @param <T> */ public class BackupRequestRetryStrategy<T> implements Transformer<T, T> { public static Func0<Stopwatch> DEFAULT_CLOCK = Stopwatches.systemNano(); private final Func0<Stopwatch> sw; private final SingleMetric<Long> metric; private final Func1<Throwable, Boolean> retriableError; private final Scheduler scheduler; public static class Builder<T> { private Func0<Stopwatch> sw = DEFAULT_CLOCK; private SingleMetric<Long> metric = Metrics.memoize(10L); private Func1<Throwable, Boolean> retriableError = Retrys.ALWAYS; private Scheduler scheduler = Schedulers.computation(); /** * Function to determine if an exception is retriable or not. A non * retriable exception will result in an immediate error being returned * while the first retriable exception on either the primary or secondary * request will be ignored to allow the other request to complete. * @param retriableError */ public Builder<T> withIsRetriablePolicy(Func1<Throwable, Boolean> retriableError) { this.retriableError = retriableError; return this; } /** * Function to determine the backup request timeout for each operation. * @param func * @param units */ public Builder<T> withTimeoutMetric(SingleMetric<Long> metric) { this.metric = metric; return this; } /** * Provide an external scheduler to drive the backup timeout. Use this * to test with a TestScheduler * * @param scheduler */ public Builder<T> withScheduler(Scheduler scheduler) { this.scheduler = scheduler; return this; } /** * Factory for creating stopwatches. A new stopwatch is created per operation. * @param clock */ public Builder<T> withStopwatch(Func0<Stopwatch> sw) { this.sw = sw; return this; } public BackupRequestRetryStrategy<T> build() { return new BackupRequestRetryStrategy<T>(this); } } public static <T> Builder<T> builder() { return new Builder<T>(); } private BackupRequestRetryStrategy(Builder<T> builder) { this.metric = builder.metric; this.retriableError = builder.retriableError; this.scheduler = builder.scheduler; this.sw = builder.sw; } @Override public Observable<T> call(final Observable<T> o) { Observable<T> timedO = Observable.create(new OnSubscribe<T>() { @Override public void call(Subscriber<? super T> s) { final Stopwatch timer = sw.call(); o.doOnNext(new Action1<T>() { @Override public void call(T t1) { metric.add(timer.elapsed(TimeUnit.MILLISECONDS)); } }).subscribe(s); } }); return Observable .just(timedO, timedO.delaySubscription(metric.get(), TimeUnit.MILLISECONDS, scheduler)) .flatMap(new Func1<Observable<T>, Observable<T>>() { final AtomicInteger counter = new AtomicInteger(); @Override public Observable<T> call(Observable<T> t1) { return t1.onErrorResumeNext(new Func1<Throwable, Observable<T>>() { @Override public Observable<T> call(Throwable e) { if (counter.incrementAndGet() == 2 || !retriableError.call(e)) { return Observable.error(e); } return Observable.never(); } }); } }) .take(1); } }