/* * Copyright (C) 2012-2015 DataStax Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.datastax.driver.core.policies; import com.datastax.driver.core.*; import com.datastax.driver.core.exceptions.*; import com.google.common.annotations.Beta; import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.AbstractIterator; import com.google.common.collect.ImmutableList; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.Collection; import java.util.Iterator; import java.util.List; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; import java.util.concurrent.TimeUnit; import static java.util.concurrent.TimeUnit.MINUTES; import static java.util.concurrent.TimeUnit.NANOSECONDS; /** * Chainable load balancing policy that filters out hosts based on their error rates. * <p/> * When creating a query plan, this policy gathers a list of candidate hosts from its child policy; for each candidate * host, it then determines whether it should be included into or excluded from the final query plan, based on its * current error rate (measured over the last minute, with a 5-second granularity). * <p/> * Note that the policy should not blindly count all errors in its measurements: some type of errors (e.g. CQL syntax * errors) can originate from the client and occur on all hosts, therefore they should not count towards the exclusion * threshold or all hosts could become excluded. You can provide your own {@link ErrorFilter} to customize that logic. * <p/> * The policy follows the builder pattern to be created, the {@link Builder} class can be created with * {@link #builder} method. * <p/> * This policy is currently in BETA mode and its behavior might be changing throughout different driver versions. */ @Beta public class ErrorAwarePolicy implements ChainableLoadBalancingPolicy { private static final Logger logger = LoggerFactory.getLogger(ErrorAwarePolicy.class); private final LoadBalancingPolicy childPolicy; private final long retryPeriodNanos; PerHostErrorTracker errorTracker; private ErrorAwarePolicy(Builder builder) { this.childPolicy = builder.childPolicy; this.retryPeriodNanos = builder.retryPeriodNanos; this.errorTracker = new PerHostErrorTracker(builder.maxErrorsPerMinute, builder.errorFilter, builder.clock); } @Override public LoadBalancingPolicy getChildPolicy() { return childPolicy; } @Override public void init(Cluster cluster, Collection<Host> hosts) { childPolicy.init(cluster, hosts); cluster.register(this.errorTracker); } @Override public HostDistance distance(Host host) { return childPolicy.distance(host); } @Override public Iterator<Host> newQueryPlan(String loggedKeyspace, Statement statement) { final Iterator<Host> childQueryPlan = childPolicy.newQueryPlan(loggedKeyspace, statement); return new AbstractIterator<Host>() { @Override protected Host computeNext() { while (childQueryPlan.hasNext()) { Host host = childQueryPlan.next(); if (!errorTracker.isExcluded(host)) { return host; } } return endOfData(); } }; } @Override public void onAdd(Host host) { childPolicy.onAdd(host); } @Override public void onUp(Host host) { childPolicy.onUp(host); } @Override public void onDown(Host host) { childPolicy.onDown(host); } @Override public void onRemove(Host host) { childPolicy.onRemove(host); } /** * Creates a new error aware policy builder given the child policy * that the resulting policy should wrap. * * @param childPolicy the load balancing policy to wrap with error * awareness. * @return the created builder. */ public static Builder builder(LoadBalancingPolicy childPolicy) { return new Builder(childPolicy); } @Override public void close() { childPolicy.close(); } /** * Utility class to create a {@link ErrorAwarePolicy}. */ public static class Builder { final LoadBalancingPolicy childPolicy; private int maxErrorsPerMinute = 1; private long retryPeriodNanos = NANOSECONDS.convert(2, MINUTES); private Clock clock = Clock.DEFAULT; private ErrorFilter errorFilter = new DefaultErrorFilter(); /** * Creates a {@link Builder} instance. * * @param childPolicy the load balancing policy to wrap with error * awareness. */ public Builder(LoadBalancingPolicy childPolicy) { this.childPolicy = childPolicy; } /** * Defines the maximum number of errors allowed per minute for each host. * <p/> * The policy keeps track of the number of errors on each host (filtered by * {@link Builder#withErrorsFilter(com.datastax.driver.core.policies.ErrorAwarePolicy.ErrorFilter)}) * over a sliding 1-minute window. If a host had more than this number * of errors, it will be excluded from the query plan for the duration defined by * {@link #withRetryPeriod(long, TimeUnit)}. * <p/> * Default value for the threshold is 1. * * @param maxErrorsPerMinute the number. * @return this {@link Builder} instance, for method chaining. */ public Builder withMaxErrorsPerMinute(int maxErrorsPerMinute) { this.maxErrorsPerMinute = maxErrorsPerMinute; return this; } /** * Defines the time during which a host is excluded by the policy once it has exceeded * {@link #withMaxErrorsPerMinute(int)}. * <p/> * Default value for the retry period is 2 minutes. * * @param retryPeriod the period of exclusion for a host. * @param retryPeriodTimeUnit the time unit for the retry period. * @return this {@link Builder} instance, for method chaining. */ public Builder withRetryPeriod(long retryPeriod, TimeUnit retryPeriodTimeUnit) { this.retryPeriodNanos = retryPeriodTimeUnit.toNanos(retryPeriod); return this; } /** * Provides a filter that will decide which errors are counted towards {@link #withMaxErrorsPerMinute(int)}. * <p/> * The default implementation will exclude from the error counting, the following exception types: * <ul> * <li>{@link QueryConsistencyException} and {@link UnavailableException}: the assumption is that these errors * are most often caused by other replicas being unavailable, not by something wrong on the coordinator;</li> * <li>{@link InvalidQueryException}, {@link AlreadyExistsException}, {@link SyntaxError}: these are likely * caused by a bad query in client code, that will fail on all hosts. Excluding hosts could lead to complete * loss of connectivity, rather the solution is to fix the query;</li> * <li>{@link FunctionExecutionException}: similarly, this is caused by a bad function definition and likely to * fail on all hosts.</li> * </ul> * * @param errorFilter the filter class that the policy will use. * @return this {@link Builder} instance, for method chaining. */ public Builder withErrorsFilter(ErrorFilter errorFilter) { this.errorFilter = errorFilter; return this; } @VisibleForTesting Builder withClock(Clock clock) { this.clock = clock; return this; } /** * Creates the {@link ErrorAwarePolicy} instance. * * @return the newly created {@link ErrorAwarePolicy}. */ public ErrorAwarePolicy build() { return new ErrorAwarePolicy(this); } } class PerHostErrorTracker implements LatencyTracker { private final int maxErrorsPerMinute; private final ErrorFilter errorFilter; private final Clock clock; private final ConcurrentMap<Host, RollingCount> hostsCounts = new ConcurrentHashMap<Host, RollingCount>(); private final ConcurrentMap<Host, Long> exclusionTimes = new ConcurrentHashMap<Host, Long>(); PerHostErrorTracker(int maxErrorsPerMinute, ErrorFilter errorFilter, Clock clock) { this.maxErrorsPerMinute = maxErrorsPerMinute; this.errorFilter = errorFilter; this.clock = clock; } @Override public void update(Host host, Statement statement, Exception exception, long newLatencyNanos) { if (exception == null) { return; } if (!errorFilter.shouldConsiderError(exception, host, statement)) { return; } RollingCount hostCount = getOrCreateCount(host); hostCount.increment(); } boolean isExcluded(Host host) { Long excludedTime = exclusionTimes.get(host); boolean expired = excludedTime != null && clock.nanoTime() - excludedTime >= retryPeriodNanos; if (excludedTime == null || expired) { if (maybeExcludeNow(host, excludedTime)) { return true; } if (expired) { // Cleanup, but make sure we don't overwrite if another thread just set it exclusionTimes.remove(host, excludedTime); } return false; } else { // host is already excluded return true; } } // Exclude if we're over the threshold private boolean maybeExcludeNow(Host host, Long previousTime) { RollingCount rollingCount = getOrCreateCount(host); long count = rollingCount.get(); if (count > maxErrorsPerMinute) { excludeNow(host, count, previousTime); return true; } else { return false; } } // Set the exclusion time to now, handling potential races private void excludeNow(Host host, long count, Long previousTime) { long now = clock.nanoTime(); boolean didNotRace = (previousTime == null) ? exclusionTimes.putIfAbsent(host, now) == null : exclusionTimes.replace(host, previousTime, now); if (didNotRace && logger.isDebugEnabled()) { logger.debug(String.format("Host %s encountered %d errors in the last minute, which is more " + "than the maximum allowed (%d). It will be excluded from query plans for the " + "next %d nanoseconds.", host, count, maxErrorsPerMinute, retryPeriodNanos)); } } private RollingCount getOrCreateCount(Host host) { RollingCount hostCount = hostsCounts.get(host); if (hostCount == null) { RollingCount tmp = new RollingCount(clock); hostCount = hostsCounts.putIfAbsent(host, tmp); if (hostCount == null) hostCount = tmp; } return hostCount; } @Override public void onRegister(Cluster cluster) { // nothing to do. } @Override public void onUnregister(Cluster cluster) { // nothing to do. } } static class DefaultErrorFilter implements ErrorFilter { private static final List<Class<? extends Exception>> IGNORED_EXCEPTIONS = ImmutableList.<Class<? extends Exception>>builder() .add(FunctionExecutionException.class) .add(QueryConsistencyException.class) .add(UnavailableException.class) .add(AlreadyExistsException.class) .add(InvalidQueryException.class) .add(SyntaxError.class) .build(); @Override public boolean shouldConsiderError(Exception e, Host host, Statement statement) { for (Class<? extends Exception> ignoredException : IGNORED_EXCEPTIONS) { if (ignoredException.isInstance(e)) return false; } return true; } } /** * A filter for the errors considered by {@link ErrorAwarePolicy}. * <p/> * Only errors that indicate something wrong with a host should lead to its exclusion from query plans. */ public interface ErrorFilter { /** * Whether an error should be counted in the host's error rate. * * @param e the exception. * @param host the host. * @param statement the statement that caused the exception. * @return {@code true} if the exception should be counted. */ boolean shouldConsiderError(Exception e, Host host, Statement statement); } }