/* * Copyright 2016 the original author or authors. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License */ package io.atomix.cluster; import io.atomix.AtomixReplica; import io.atomix.catalyst.annotations.Experimental; import io.atomix.catalyst.util.Assert; import io.atomix.copycat.error.ConfigurationException; import io.atomix.copycat.server.cluster.Cluster; import io.atomix.copycat.server.cluster.Member; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.Collection; import java.util.Optional; import java.util.concurrent.CompletableFuture; import java.util.function.BiConsumer; import java.util.function.Function; import java.util.stream.Collectors; /** * Cluster manager implementation that automatically balances the cluster. * * @author <a href="http://github.com/kuujo>Jordan Halterman</a> */ @Experimental public class BalancingClusterManager implements ClusterManager { /** * Returns a new balancing cluster manager builder. * * @return A new balancing cluster manager builder. */ public static Builder builder() { return new Builder(); } private static final Logger LOGGER = LoggerFactory.getLogger(BalancingClusterManager.class); private final int quorumHint; private final int backupCount; private boolean closed; public BalancingClusterManager(int quorumHint, int backupCount) { this.quorumHint = quorumHint; this.backupCount = backupCount; } @Override public CompletableFuture<Void> start(Cluster cluster, AtomixReplica replica) { cluster.members().forEach(m -> { m.onTypeChange(t -> balance(cluster)); m.onStatusChange(s -> balance(cluster)); }); cluster.onLeaderElection(l -> balance(cluster)); cluster.onJoin(m -> { m.onTypeChange(t -> balance(cluster)); m.onStatusChange(s -> balance(cluster)); balance(cluster); }); cluster.onLeave(m -> balance(cluster)); return null; } /** * Balances the given cluster. * * @param cluster The cluster to rebalance. * @return A completable future to be completed once the cluster has been balanced. */ public CompletableFuture<Void> balance(Cluster cluster) { if (cluster.member().equals(cluster.leader())) { LOGGER.info("Balancing cluster..."); return balance(cluster, new CompletableFuture<>()); } return CompletableFuture.completedFuture(null); } /** * Balances the cluster, recursively promoting/demoting members as necessary. */ private CompletableFuture<Void> balance(Cluster cluster, CompletableFuture<Void> future) { if (closed) { future.completeExceptionally(new IllegalStateException("balancer closed")); return future; } Collection<Member> members = cluster.members(); Member member = cluster.member(); Collection<Member> active = members.stream().filter(m -> m.type() == Member.Type.ACTIVE).collect(Collectors.toList()); Collection<Member> passive = members.stream().filter(m -> m.type() == Member.Type.PASSIVE).collect(Collectors.toList()); Collection<Member> reserve = members.stream().filter(m -> m.type() == Member.Type.RESERVE).collect(Collectors.toList()); int totalActiveCount = active.size(); int totalPassiveCount = passive.size(); long availableActiveCount = active.stream().filter(m -> m.status() == Member.Status.AVAILABLE).count(); long availablePassiveCount = passive.stream().filter(m -> m.status() == Member.Status.AVAILABLE).count(); long availableReserveCount = reserve.stream().filter(m -> m.status() == Member.Status.AVAILABLE).count(); BiConsumer<Void, Throwable> completeFunction = (result, error) -> { if (error == null || error.getCause() instanceof ConfigurationException) { balance(cluster, future); } else { future.completeExceptionally(error); } }; // If the number of available active members is less than the quorum hint, promote a passive or reserve member. if (quorumHint == Quorum.ALL.size() || availableActiveCount < quorumHint) { // If a passive member is available, promote it. if (availablePassiveCount > 0) { Member promote = passive.stream().filter(m -> m.status() == Member.Status.AVAILABLE).findFirst().get(); LOGGER.info("Promoting {} to ACTIVE: not enough active members", promote.address()); promote.promote(Member.Type.ACTIVE).whenComplete(completeFunction); return future; } // If a reserve member is available, promote it. else if (availableReserveCount > 0) { Member promote = reserve.stream().filter(m -> m.status() == Member.Status.AVAILABLE).findFirst().get(); LOGGER.info("Promoting {} to ACTIVE: not enough active members", promote.address()); promote.promote(Member.Type.ACTIVE).whenComplete(completeFunction); return future; } } // If the total number of active members is greater than the quorum hint, demote an active member. // Preferably, we want to demote a member that is unavailable. if (quorumHint != Quorum.ALL.size() && totalActiveCount > quorumHint) { // If the number of available passive members is less than the required number, demote an active // member to passive. if (availablePassiveCount < (quorumHint - 1) * backupCount) { Member demote = active.stream().filter(m -> m.status() == Member.Status.UNAVAILABLE).findFirst() .orElseGet(() -> active.stream().filter(m -> !m.equals(member)).findAny().get()); LOGGER.info("Demoting {} to PASSIVE: too many active members", demote.address()); demote.demote(Member.Type.PASSIVE).whenComplete(completeFunction); return future; } // Otherwise, demote an active member to reserve. else { Member demote = active.stream().filter(m -> m.status() == Member.Status.UNAVAILABLE).findAny() .orElseGet(() -> active.stream().filter(m -> !m.equals(member)).findAny().get()); LOGGER.info("Demoting {} to RESERVE: too many active members", demote.address()); demote.demote(Member.Type.RESERVE).whenComplete(completeFunction); return future; } } // If the number of available passive members is less than the required number of passive members, // promote a reserve member. if (quorumHint != Quorum.ALL.size() && availablePassiveCount < (quorumHint - 1) * backupCount) { // If any reserve members are available, promote to passive. if (availableReserveCount > 0) { Member promote = reserve.stream().filter(m -> m.status() == Member.Status.AVAILABLE).findFirst().get(); LOGGER.info("Promoting {} to PASSIVE: not enough passive members", promote.address()); promote.promote(Member.Type.PASSIVE).whenComplete(completeFunction); return future; } } // If the total number of passive members is greater than the required number of passive members, // demote a passive member. Preferably we demote an unavailable member. if (quorumHint != Quorum.ALL.size() && totalPassiveCount > (quorumHint - 1) * backupCount) { Member demote = passive.stream().filter(m -> m.status() == Member.Status.UNAVAILABLE).findAny() .orElseGet(() -> passive.stream().findAny().get()); LOGGER.info("Demoting {} to RESERVE: too many passive members", demote.address()); demote.demote(Member.Type.RESERVE).whenComplete(completeFunction); return future; } // If we've made it this far then the cluster is balanced. future.complete(null); return future; } @Override public CompletableFuture<Void> stop(Cluster cluster, AtomixReplica replica) { LOGGER.debug("Balancing cluster..."); return replace(cluster, new CompletableFuture<>()).whenComplete((result, error) -> closed = true); } /** * Replaces the local member in the cluster. */ private CompletableFuture<Void> replace(Cluster cluster, CompletableFuture<Void> future) { if (closed) { future.completeExceptionally(new IllegalStateException("cluster balancer closed")); return future; } BiConsumer<Void, Throwable> completeFunction = (result, error) -> { if (error == null) { future.complete(null); } else if (error.getCause() instanceof ConfigurationException) { replace(cluster, future); } else { future.completeExceptionally(error); } }; Function<Void, CompletableFuture<Void>> demoteFunction = v -> { long passiveCount = cluster.members().stream().filter(m -> m.type() == Member.Type.PASSIVE).count(); if (passiveCount < (quorumHint - 1) * backupCount) { LOGGER.info("Demoting {} to PASSIVE", cluster.member().address()); return cluster.member().demote(Member.Type.PASSIVE); } else { LOGGER.info("Demoting {} to RESERVE", cluster.member().address()); return cluster.member().demote(Member.Type.RESERVE); } }; // If the quorum hint is ALL, don't replace the replica. if (quorumHint == Quorum.ALL.size()) { return CompletableFuture.completedFuture(null); } // If the local member is active, replace it with a passive or reserve member. if (cluster.member().type() == Member.Type.ACTIVE) { // Get a list of passive members. Collection<Member> passive = cluster.members().stream() .filter(m -> m.type() == Member.Type.PASSIVE) .collect(Collectors.toList()); // Get a list of reserve members. Collection<Member> reserve = cluster.members().stream() .filter(m -> m.type() == Member.Type.RESERVE) .collect(Collectors.toList()); // Attempt to promote an available passive member. if (!passive.isEmpty()) { Optional<Member> optionalMember = passive.stream().filter(m -> m.status() == Member.Status.AVAILABLE).findFirst(); if (optionalMember.isPresent()) { LOGGER.info("Promoting {} to ACTIVE: replacing {}", optionalMember.get().address(), cluster.member().address()); optionalMember.get().promote(Member.Type.ACTIVE) .thenCompose(demoteFunction) .whenComplete(completeFunction); return future; } } // Attempt to promote an available reserve member. if (!reserve.isEmpty()) { Optional<Member> optionalMember = reserve.stream().filter(m -> m.status() == Member.Status.AVAILABLE).findFirst(); if (optionalMember.isPresent()) { LOGGER.info("Promoting {} to ACTIVE: replacing {}", optionalMember.get().address(), cluster.member().address()); optionalMember.get().promote(Member.Type.ACTIVE) .thenCompose(demoteFunction) .whenComplete(completeFunction); return future; } } // Promote an unavailable passive or reserve member. if (!passive.isEmpty()) { Member member = passive.iterator().next(); LOGGER.info("Promoting {} to ACTIVE: replacing {}", member.address(), cluster.member().address()); member.promote(Member.Type.ACTIVE) .thenCompose(demoteFunction) .whenComplete(completeFunction); } else if (!reserve.isEmpty()) { Member member = reserve.iterator().next(); LOGGER.info("Promoting {} to ACTIVE: replacing {}", member.address(), cluster.member().address()); member.promote(Member.Type.ACTIVE) .thenCompose(demoteFunction) .whenComplete(completeFunction); } else { future.complete(null); } } else if (cluster.member().type() == Member.Type.PASSIVE) { Collection<Member> reserve = cluster.members().stream() .filter(m -> m.type() == Member.Type.RESERVE) .collect(Collectors.toList()); if (!reserve.isEmpty()) { Optional<Member> optionalMember = reserve.stream().filter(m -> m.status() == Member.Status.AVAILABLE).findFirst(); if (optionalMember.isPresent()) { LOGGER.info("Promoting {} to PASSIVE: replacing {}", optionalMember.get().address(), cluster.member().address()); optionalMember.get().promote(Member.Type.PASSIVE) .thenCompose(demoteFunction) .whenComplete(completeFunction); } else { Member member = reserve.iterator().next(); LOGGER.info("Promoting {} to PASSIVE: replacing {}", member.address(), cluster.member().address()); member.promote(Member.Type.PASSIVE) .thenCompose(demoteFunction) .whenComplete(completeFunction); } } else { future.complete(null); } } else { future.complete(null); } return future; } /** * Balancing cluster manager builder. */ public static class Builder implements ClusterManager.Builder { private int quorumHint = Quorum.ALL.size(); private int backupCount = 0; /** * Sets the cluster quorum hint. * <p> * The quorum hint specifies the optimal number of replicas to actively participate in the Raft * consensus algorithm. As long as there are at least {@code quorumHint} replicas in the cluster, * Atomix will automatically balance replicas to ensure that at least {@code quorumHint} replicas * are active participants in the Raft algorithm at any given time. Replicas can be added to or * removed from the cluster at will, and remaining replicas will be transparently promoted and demoted * as necessary to maintain the desired quorum size. * <p> * The size of the quorum is relevant both to performance and fault-tolerance. When resources are * created or deleted or resource state changes are submitted to the cluster, Atomix will synchronously * replicate changes to a majority of the cluster before they can be committed and update state. For * example, in a cluster where the {@code quorumHint} is {@code 3}, a * {@link io.atomix.collections.DistributedMap#put(Object, Object)} command must be sent to the leader * and then synchronously replicated to one other replica before it can be committed and applied to the * map state machine. This also means that a cluster with {@code quorumHint} equal to {@code 3} can tolerate * at most one failure. * <p> * Users should set the {@code quorumHint} to an odd number of replicas or use one of the {@link Quorum} * magic constants for the greatest level of fault tolerance. Typically, in write-heavy workloads, the most * performant configuration will be a {@code quorumHint} of {@code 3}. In read-heavy workloads, quorum hints * of {@code 3} or {@code 5} can be used depending on the size of the cluster and desired level of fault tolerance. * Additional active replicas may or may not improve read performance depending on usage and in particular * {@link io.atomix.resource.ReadConsistency read consistency} levels. * * @param quorumHint The quorum hint. This must be the same on all replicas in the cluster. * @return The replica builder. * @throws IllegalArgumentException if the quorum hint is less than {@code -1} */ public Builder withQuorumHint(int quorumHint) { this.quorumHint = Assert.argNot(quorumHint, quorumHint < -1, "quorumHint must be positive or -1"); return this; } /** * Sets the cluster quorum hint. * <p> * The quorum hint specifies the optimal number of replicas to actively participate in the Raft * consensus algorithm. As long as there are at least {@code quorumHint} replicas in the cluster, * Atomix will automatically balance replicas to ensure that at least {@code quorumHint} replicas * are active participants in the Raft algorithm at any given time. Replicas can be added to or * removed from the cluster at will, and remaining replicas will be transparently promoted and demoted * as necessary to maintain the desired quorum size. * <p> * By default, the configured quorum hint is {@link Quorum#ALL}. * <p> * The size of the quorum is relevant both to performance and fault-tolerance. When resources are * created or deleted or resource state changes are submitted to the cluster, Atomix will synchronously * replicate changes to a majority of the cluster before they can be committed and update state. For * example, in a cluster where the {@code quorumHint} is {@code 3}, a * {@link io.atomix.collections.DistributedMap#put(Object, Object)} command must be sent to the leader * and then synchronously replicated to one other replica before it can be committed and applied to the * map state machine. This also means that a cluster with {@code quorumHint} equal to {@code 3} can tolerate * at most one failure. * <p> * Users should set the {@code quorumHint} to an odd number of replicas or use one of the {@link Quorum} * magic constants for the greatest level of fault tolerance. Typically, in write-heavy workloads, the most * performant configuration will be a {@code quorumHint} of {@code 3}. In read-heavy workloads, quorum hints * of {@code 3} or {@code 5} can be used depending on the size of the cluster and desired level of fault tolerance. * Additional active replicas may or may not improve read performance depending on usage and in particular * {@link io.atomix.resource.ReadConsistency read consistency} levels. * * @param quorum The quorum hint. This must be the same on all replicas in the cluster. * @return The replica builder. * @throws NullPointerException if the quorum hint is null */ public Builder withQuorumHint(Quorum quorum) { this.quorumHint = Assert.notNull(quorum, "quorum").size(); return this; } /** * Sets the replica backup count. * <p> * The backup count specifies the maximum number of replicas per {@link #withQuorumHint(int) active} replica * to participate in asynchronous replication of state. Backup replicas allow quorum-member replicas to be * more quickly replaced in the event of a failure or an active replica leaving the cluster. Additionally, * backup replicas may service {@link io.atomix.resource.ReadConsistency#SEQUENTIAL SEQUENTIAL} and * {@link io.atomix.resource.ReadConsistency#LOCAL LOCAL} reads to allow read operations to be further * spread across the cluster. * <p> * The backup count is used to calculate the number of backup replicas per non-leader active member. The * number of actual backups is calculated by {@code (quorumHint - 1) * backupCount}. If the * {@code backupCount} is {@code 1} and the {@code quorumHint} is {@code 3}, the number of backup replicas * will be {@code 2}. * <p> * By default, the backup count is {@code 0}, indicating no backups should be maintained. However, it is * recommended that each cluster have at least a backup count of {@code 1} to ensure active replicas can * be quickly replaced in the event of a network partition or other failure. Quick replacement of active * member nodes improves fault tolerance in cases where a majority of the active members in the cluster are * not lost simultaneously. * * @param backupCount The number of backup replicas per active replica. This must be the same on all * replicas in the cluster. * @return The replica builder. * @throws IllegalArgumentException if the {@code backupCount} is negative */ public Builder withBackupCount(int backupCount) { this.backupCount = Assert.argNot(backupCount, backupCount < 0, "backupCount must be positive"); return this; } @Override public ClusterManager build() { return new BalancingClusterManager(quorumHint, backupCount); } } }