BalancingClusterManager.java example

Explorer

atomix-master
- all
  - src
    - main
      - java
        io
        atomix
        bundle
        AtomixBundle.java
        package-info.java
- collections
  - src
    - main
      - java
        io
        atomix
        collections
        DistributedMap.java
        DistributedMultiMap.java
        DistributedQueue.java
        DistributedSet.java
        internal
        MapCommands.java
        MapEntry.java
        MapState.java
        MultiMapCommands.java
        MultiMapState.java
        QueueCommands.java
        QueueState.java
        SetCommands.java
        SetState.java
        package-info.java
        util
        DistributedMapFactory.java
        DistributedMultiMapFactory.java
        DistributedQueueFactory.java
        DistributedSetFactory.java
    - test
      - java
        io
        atomix
        collections
        DistributedMapTest.java
        DistributedMultiMapTest.java
        DistributedQueueTest.java
        DistributedSetTest.java
- concurrent
  - src
    - main
      - java
        io
        atomix
        concurrent
        DistributedLock.java
        internal
        LockCommands.java
        LockState.java
        package-info.java
        util
        DistributedLockFactory.java
    - test
      - java
        io
        atomix
        concurrent
        DistributedLockTest.java
- core
  - src
    - main
      - java
        io
        atomix
        Atomix.java
        AtomixClient.java
        AtomixReplica.java
        cluster
        BalancingClusterManager.java
        ClusterManager.java
        Quorum.java
        package-info.java
    - test
      - java
        io
        atomix
        AbstractAtomixTest.java
        AtomixClientTest.java
        AtomixGroupTest.java
        AtomixLockTest.java
        AtomixLongTest.java
        AtomixMapTest.java
        AtomixMultiMapTest.java
        AtomixQueueTest.java
        AtomixReplicaTest.java
        AtomixSetTest.java
        ResourceRecoveryTest.java
- examples
  - distributed-group
    - src
      - main
        java
        io
        atomix
        examples
        group
        DistributedGroupExample.java
  - distributed-lock
    - src
      - main
        java
        io
        atomix
        examples
        distributedlock
        DistributedLockExample.java
  - distributed-value
    - src
      - main
        java
        io
        atomix
        examples
        variables
        DistributedValueExample.java
  - leader-election
    - src
      - main
        java
        io
        atomix
        examples
        election
        LeaderElectionExample.java
- groups
  - src
    - main
      - java
        io
        atomix
        group
        DistributedGroup.java
        DistributedGroupFactory.java
        GroupMember.java
        LocalMember.java
        election
        Election.java
        Term.java
        internal
        GroupElection.java
        GroupTerm.java
        internal
        AbstractGroupMember.java
        AsyncMessageState.java
        GroupCommands.java
        GroupMemberInfo.java
        GroupState.java
        LocalGroupMember.java
        MemberState.java
        MembersState.java
        MembershipGroup.java
        MessageState.java
        QueueState.java
        RemoteGroupMember.java
        RequestReplyMessageState.java
        SessionState.java
        SyncMessageState.java
        messaging
        Message.java
        MessageClient.java
        MessageConsumer.java
        MessageFailedException.java
        MessageProducer.java
        MessageService.java
        internal
        AbstractMessageClient.java
        AbstractMessageConsumer.java
        AbstractMessageProducer.java
        AbstractMessageService.java
        GroupMessage.java
        GroupMessageClient.java
        GroupMessageProducer.java
        MemberMessageClient.java
        MemberMessageConsumer.java
        MemberMessageProducer.java
        MemberMessageService.java
        MessageConsumerRegistry.java
        MessageConsumerService.java
        MessageProducerRegistry.java
        MessageProducerService.java
        package-info.java
    - test
      - java
        io
        atomix
        group
        DistributedGroupTest.java
- manager
  - src
    - main
      - java
        io
        atomix
        manager
        AtomixException.java
        ResourceClient.java
        ResourceManager.java
        ResourceManagerException.java
        ResourceServer.java
        internal
        CloseResource.java
        DeleteResource.java
        GetResource.java
        GetResourceIfExists.java
        GetResourceKeys.java
        KeyOperation.java
        ManagedResourceSession.java
        ResourceExists.java
        ResourceManagerCommit.java
        ResourceManagerCommitPool.java
        ResourceManagerSessions.java
        ResourceManagerState.java
        ResourceManagerStateMachineContext.java
        ResourceManagerStateMachineExecutor.java
        options
        AtomixOptions.java
        ClientOptions.java
        ServerOptions.java
        package-info.java
        resource
        internal
        InstanceClient.java
        InstanceCommand.java
        InstanceEvent.java
        InstanceOperation.java
        InstanceQuery.java
        InstanceSession.java
        InstanceTypeResolver.java
        ResourceInstance.java
        util
        ResourceManagerTypeResolver.java
    - test
      - java
        io
        atomix
        manager
        options
        ClientOptionsTest.java
        ServerOptionsTest.java
- resource
  - src
    - main
      - java
        io
        atomix
        resource
        AbstractResource.java
        ReadConsistency.java
        Resource.java
        ResourceException.java
        ResourceFactory.java
        ResourceRegistry.java
        ResourceStateMachine.java
        ResourceStateMachineExecutor.java
        ResourceType.java
        ResourceTypeInfo.java
        internal
        ResourceCommand.java
        ResourceCommit.java
        ResourceCopycatClient.java
        ResourceEvent.java
        ResourceOperation.java
        ResourceQuery.java
        package-info.java
- standalone
  - standalone-server
    - src
      - main
        java
        io
        atomix
        standalone
        server
        StandaloneServer.java
- testing
  - src
    - main
      - java
        io
        atomix
        testing
        AbstractCopycatTest.java
        TestCaseLogger.java
- variables
  - src
    - main
      - java
        io
        atomix
        variables
        AbstractDistributedValue.java
        DistributedLong.java
        DistributedValue.java
        internal
        AbstractValueState.java
        LongCommands.java
        LongState.java
        ValueCommands.java
        ValueState.java
        package-info.java
        util
        DistributedLongFactory.java
        DistributedValueFactory.java
    - test
      - java
        io
        atomix
        variables
        DistributedLongTest.java
        DistributedValueTest.java

/*
 * Copyright 2016 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License
 */
package io.atomix.cluster;

import io.atomix.AtomixReplica;
import io.atomix.catalyst.annotations.Experimental;
import io.atomix.catalyst.util.Assert;
import io.atomix.copycat.error.ConfigurationException;
import io.atomix.copycat.server.cluster.Cluster;
import io.atomix.copycat.server.cluster.Member;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.Collection;
import java.util.Optional;
import java.util.concurrent.CompletableFuture;
import java.util.function.BiConsumer;
import java.util.function.Function;
import java.util.stream.Collectors;

/**
 * Cluster manager implementation that automatically balances the cluster.
 *
 * @author <a href="http://github.com/kuujo>Jordan Halterman</a>
 */
@Experimental
public class BalancingClusterManager implements ClusterManager {

  /**
   * Returns a new balancing cluster manager builder.
   *
   * @return A new balancing cluster manager builder.
   */
  public static Builder builder() {
    return new Builder();
  }

  private static final Logger LOGGER = LoggerFactory.getLogger(BalancingClusterManager.class);
  private final int quorumHint;
  private final int backupCount;
  private boolean closed;

  public BalancingClusterManager(int quorumHint, int backupCount) {
    this.quorumHint = quorumHint;
    this.backupCount = backupCount;
  }

  @Override
  public CompletableFuture<Void> start(Cluster cluster, AtomixReplica replica) {
    cluster.members().forEach(m -> {
      m.onTypeChange(t -> balance(cluster));
      m.onStatusChange(s -> balance(cluster));
    });

    cluster.onLeaderElection(l -> balance(cluster));

    cluster.onJoin(m -> {
      m.onTypeChange(t -> balance(cluster));
      m.onStatusChange(s -> balance(cluster));
      balance(cluster);
    });

    cluster.onLeave(m -> balance(cluster));
    return null;
  }

  /**
   * Balances the given cluster.
   *
   * @param cluster The cluster to rebalance.
   * @return A completable future to be completed once the cluster has been balanced.
   */
  public CompletableFuture<Void> balance(Cluster cluster) {
    if (cluster.member().equals(cluster.leader())) {
      LOGGER.info("Balancing cluster...");
      return balance(cluster, new CompletableFuture<>());
    }
    return CompletableFuture.completedFuture(null);
  }

  /**
   * Balances the cluster, recursively promoting/demoting members as necessary.
   */
  private CompletableFuture<Void> balance(Cluster cluster, CompletableFuture<Void> future) {
    if (closed) {
      future.completeExceptionally(new IllegalStateException("balancer closed"));
      return future;
    }

    Collection<Member> members = cluster.members();
    Member member = cluster.member();

    Collection<Member> active = members.stream().filter(m -> m.type() == Member.Type.ACTIVE).collect(Collectors.toList());
    Collection<Member> passive = members.stream().filter(m -> m.type() == Member.Type.PASSIVE).collect(Collectors.toList());
    Collection<Member> reserve = members.stream().filter(m -> m.type() == Member.Type.RESERVE).collect(Collectors.toList());

    int totalActiveCount = active.size();
    int totalPassiveCount = passive.size();

    long availableActiveCount = active.stream().filter(m -> m.status() == Member.Status.AVAILABLE).count();
    long availablePassiveCount = passive.stream().filter(m -> m.status() == Member.Status.AVAILABLE).count();
    long availableReserveCount = reserve.stream().filter(m -> m.status() == Member.Status.AVAILABLE).count();

    BiConsumer<Void, Throwable> completeFunction = (result, error) -> {
      if (error == null || error.getCause() instanceof ConfigurationException) {
        balance(cluster, future);
      } else {
        future.completeExceptionally(error);
      }
    };

    // If the number of available active members is less than the quorum hint, promote a passive or reserve member.
    if (quorumHint == Quorum.ALL.size() || availableActiveCount < quorumHint) {
      // If a passive member is available, promote it.
      if (availablePassiveCount > 0) {
        Member promote = passive.stream().filter(m -> m.status() == Member.Status.AVAILABLE).findFirst().get();
        LOGGER.info("Promoting {} to ACTIVE: not enough active members", promote.address());
        promote.promote(Member.Type.ACTIVE).whenComplete(completeFunction);
        return future;
      }
      // If a reserve member is available, promote it.
      else if (availableReserveCount > 0) {
        Member promote = reserve.stream().filter(m -> m.status() == Member.Status.AVAILABLE).findFirst().get();
        LOGGER.info("Promoting {} to ACTIVE: not enough active members", promote.address());
        promote.promote(Member.Type.ACTIVE).whenComplete(completeFunction);
        return future;
      }
    }

    // If the total number of active members is greater than the quorum hint, demote an active member.
    // Preferably, we want to demote a member that is unavailable.
    if (quorumHint != Quorum.ALL.size() && totalActiveCount > quorumHint) {
      // If the number of available passive members is less than the required number, demote an active
      // member to passive.
      if (availablePassiveCount < (quorumHint - 1) * backupCount) {
        Member demote = active.stream().filter(m -> m.status() == Member.Status.UNAVAILABLE).findFirst()
          .orElseGet(() -> active.stream().filter(m -> !m.equals(member)).findAny().get());
        LOGGER.info("Demoting {} to PASSIVE: too many active members", demote.address());
        demote.demote(Member.Type.PASSIVE).whenComplete(completeFunction);
        return future;
      }
      // Otherwise, demote an active member to reserve.
      else {
        Member demote = active.stream().filter(m -> m.status() == Member.Status.UNAVAILABLE).findAny()
          .orElseGet(() -> active.stream().filter(m -> !m.equals(member)).findAny().get());
        LOGGER.info("Demoting {} to RESERVE: too many active members", demote.address());
        demote.demote(Member.Type.RESERVE).whenComplete(completeFunction);
        return future;
      }
    }

    // If the number of available passive members is less than the required number of passive members,
    // promote a reserve member.
    if (quorumHint != Quorum.ALL.size() && availablePassiveCount < (quorumHint - 1) * backupCount) {
      // If any reserve members are available, promote to passive.
      if (availableReserveCount > 0) {
        Member promote = reserve.stream().filter(m -> m.status() == Member.Status.AVAILABLE).findFirst().get();
        LOGGER.info("Promoting {} to PASSIVE: not enough passive members", promote.address());
        promote.promote(Member.Type.PASSIVE).whenComplete(completeFunction);
        return future;
      }
    }

    // If the total number of passive members is greater than the required number of passive members,
    // demote a passive member. Preferably we demote an unavailable member.
    if (quorumHint != Quorum.ALL.size() && totalPassiveCount > (quorumHint - 1) * backupCount) {
      Member demote = passive.stream().filter(m -> m.status() == Member.Status.UNAVAILABLE).findAny()
        .orElseGet(() -> passive.stream().findAny().get());
      LOGGER.info("Demoting {} to RESERVE: too many passive members", demote.address());
      demote.demote(Member.Type.RESERVE).whenComplete(completeFunction);
      return future;
    }

    // If we've made it this far then the cluster is balanced.
    future.complete(null);
    return future;
  }

  @Override
  public CompletableFuture<Void> stop(Cluster cluster, AtomixReplica replica) {
    LOGGER.debug("Balancing cluster...");
    return replace(cluster, new CompletableFuture<>()).whenComplete((result, error) -> closed = true);
  }

  /**
   * Replaces the local member in the cluster.
   */
  private CompletableFuture<Void> replace(Cluster cluster, CompletableFuture<Void> future) {
    if (closed) {
      future.completeExceptionally(new IllegalStateException("cluster balancer closed"));
      return future;
    }

    BiConsumer<Void, Throwable> completeFunction = (result, error) -> {
      if (error == null) {
        future.complete(null);
      } else if (error.getCause() instanceof ConfigurationException) {
        replace(cluster, future);
      } else {
        future.completeExceptionally(error);
      }
    };

    Function<Void, CompletableFuture<Void>> demoteFunction = v -> {
      long passiveCount = cluster.members().stream().filter(m -> m.type() == Member.Type.PASSIVE).count();
      if (passiveCount < (quorumHint - 1) * backupCount) {
        LOGGER.info("Demoting {} to PASSIVE", cluster.member().address());
        return cluster.member().demote(Member.Type.PASSIVE);
      } else {
        LOGGER.info("Demoting {} to RESERVE", cluster.member().address());
        return cluster.member().demote(Member.Type.RESERVE);
      }
    };

    // If the quorum hint is ALL, don't replace the replica.
    if (quorumHint == Quorum.ALL.size()) {
      return CompletableFuture.completedFuture(null);
    }

    // If the local member is active, replace it with a passive or reserve member.
    if (cluster.member().type() == Member.Type.ACTIVE) {
      // Get a list of passive members.
      Collection<Member> passive = cluster.members().stream()
        .filter(m -> m.type() == Member.Type.PASSIVE)
        .collect(Collectors.toList());

      // Get a list of reserve members.
      Collection<Member> reserve = cluster.members().stream()
        .filter(m -> m.type() == Member.Type.RESERVE)
        .collect(Collectors.toList());

      // Attempt to promote an available passive member.
      if (!passive.isEmpty()) {
        Optional<Member> optionalMember = passive.stream().filter(m -> m.status() == Member.Status.AVAILABLE).findFirst();
        if (optionalMember.isPresent()) {
          LOGGER.info("Promoting {} to ACTIVE: replacing {}", optionalMember.get().address(), cluster.member().address());
          optionalMember.get().promote(Member.Type.ACTIVE)
            .thenCompose(demoteFunction)
            .whenComplete(completeFunction);
          return future;
        }
      }

      // Attempt to promote an available reserve member.
      if (!reserve.isEmpty()) {
        Optional<Member> optionalMember = reserve.stream().filter(m -> m.status() == Member.Status.AVAILABLE).findFirst();
        if (optionalMember.isPresent()) {
          LOGGER.info("Promoting {} to ACTIVE: replacing {}", optionalMember.get().address(), cluster.member().address());
          optionalMember.get().promote(Member.Type.ACTIVE)
            .thenCompose(demoteFunction)
            .whenComplete(completeFunction);
          return future;
        }
      }

      // Promote an unavailable passive or reserve member.
      if (!passive.isEmpty()) {
        Member member = passive.iterator().next();
        LOGGER.info("Promoting {} to ACTIVE: replacing {}", member.address(), cluster.member().address());
        member.promote(Member.Type.ACTIVE)
          .thenCompose(demoteFunction)
          .whenComplete(completeFunction);
      } else if (!reserve.isEmpty()) {
        Member member = reserve.iterator().next();
        LOGGER.info("Promoting {} to ACTIVE: replacing {}", member.address(), cluster.member().address());
        member.promote(Member.Type.ACTIVE)
          .thenCompose(demoteFunction)
          .whenComplete(completeFunction);
      } else {
        future.complete(null);
      }
    } else if (cluster.member().type() == Member.Type.PASSIVE) {
      Collection<Member> reserve = cluster.members().stream()
        .filter(m -> m.type() == Member.Type.RESERVE)
        .collect(Collectors.toList());

      if (!reserve.isEmpty()) {
        Optional<Member> optionalMember = reserve.stream().filter(m -> m.status() == Member.Status.AVAILABLE).findFirst();
        if (optionalMember.isPresent()) {
          LOGGER.info("Promoting {} to PASSIVE: replacing {}", optionalMember.get().address(), cluster.member().address());
          optionalMember.get().promote(Member.Type.PASSIVE)
            .thenCompose(demoteFunction)
            .whenComplete(completeFunction);
        } else {
          Member member = reserve.iterator().next();
          LOGGER.info("Promoting {} to PASSIVE: replacing {}", member.address(), cluster.member().address());
          member.promote(Member.Type.PASSIVE)
            .thenCompose(demoteFunction)
            .whenComplete(completeFunction);
        }
      } else {
        future.complete(null);
      }
    } else {
      future.complete(null);
    }
    return future;
  }

  /**
   * Balancing cluster manager builder.
   */
  public static class Builder implements ClusterManager.Builder {
    private int quorumHint = Quorum.ALL.size();
    private int backupCount = 0;

    /**
     * Sets the cluster quorum hint.
     * <p>
     * The quorum hint specifies the optimal number of replicas to actively participate in the Raft
     * consensus algorithm. As long as there are at least {@code quorumHint} replicas in the cluster,
     * Atomix will automatically balance replicas to ensure that at least {@code quorumHint} replicas
     * are active participants in the Raft algorithm at any given time. Replicas can be added to or
     * removed from the cluster at will, and remaining replicas will be transparently promoted and demoted
     * as necessary to maintain the desired quorum size.
     * <p>
     * The size of the quorum is relevant both to performance and fault-tolerance. When resources are
     * created or deleted or resource state changes are submitted to the cluster, Atomix will synchronously
     * replicate changes to a majority of the cluster before they can be committed and update state. For
     * example, in a cluster where the {@code quorumHint} is {@code 3}, a
     * {@link io.atomix.collections.DistributedMap#put(Object, Object)} command must be sent to the leader
     * and then synchronously replicated to one other replica before it can be committed and applied to the
     * map state machine. This also means that a cluster with {@code quorumHint} equal to {@code 3} can tolerate
     * at most one failure.
     * <p>
     * Users should set the {@code quorumHint} to an odd number of replicas or use one of the {@link Quorum}
     * magic constants for the greatest level of fault tolerance. Typically, in write-heavy workloads, the most
     * performant configuration will be a {@code quorumHint} of {@code 3}. In read-heavy workloads, quorum hints
     * of {@code 3} or {@code 5} can be used depending on the size of the cluster and desired level of fault tolerance.
     * Additional active replicas may or may not improve read performance depending on usage and in particular
     * {@link io.atomix.resource.ReadConsistency read consistency} levels.
     *
     * @param quorumHint The quorum hint. This must be the same on all replicas in the cluster.
     * @return The replica builder.
     * @throws IllegalArgumentException if the quorum hint is less than {@code -1}
     */
    public Builder withQuorumHint(int quorumHint) {
      this.quorumHint = Assert.argNot(quorumHint, quorumHint < -1, "quorumHint must be positive or -1");
      return this;
    }

    /**
     * Sets the cluster quorum hint.
     * <p>
     * The quorum hint specifies the optimal number of replicas to actively participate in the Raft
     * consensus algorithm. As long as there are at least {@code quorumHint} replicas in the cluster,
     * Atomix will automatically balance replicas to ensure that at least {@code quorumHint} replicas
     * are active participants in the Raft algorithm at any given time. Replicas can be added to or
     * removed from the cluster at will, and remaining replicas will be transparently promoted and demoted
     * as necessary to maintain the desired quorum size.
     * <p>
     * By default, the configured quorum hint is {@link Quorum#ALL}.
     * <p>
     * The size of the quorum is relevant both to performance and fault-tolerance. When resources are
     * created or deleted or resource state changes are submitted to the cluster, Atomix will synchronously
     * replicate changes to a majority of the cluster before they can be committed and update state. For
     * example, in a cluster where the {@code quorumHint} is {@code 3}, a
     * {@link io.atomix.collections.DistributedMap#put(Object, Object)} command must be sent to the leader
     * and then synchronously replicated to one other replica before it can be committed and applied to the
     * map state machine. This also means that a cluster with {@code quorumHint} equal to {@code 3} can tolerate
     * at most one failure.
     * <p>
     * Users should set the {@code quorumHint} to an odd number of replicas or use one of the {@link Quorum}
     * magic constants for the greatest level of fault tolerance. Typically, in write-heavy workloads, the most
     * performant configuration will be a {@code quorumHint} of {@code 3}. In read-heavy workloads, quorum hints
     * of {@code 3} or {@code 5} can be used depending on the size of the cluster and desired level of fault tolerance.
     * Additional active replicas may or may not improve read performance depending on usage and in particular
     * {@link io.atomix.resource.ReadConsistency read consistency} levels.
     *
     * @param quorum The quorum hint. This must be the same on all replicas in the cluster.
     * @return The replica builder.
     * @throws NullPointerException if the quorum hint is null
     */
    public Builder withQuorumHint(Quorum quorum) {
      this.quorumHint = Assert.notNull(quorum, "quorum").size();
      return this;
    }

    /**
     * Sets the replica backup count.
     * <p>
     * The backup count specifies the maximum number of replicas per {@link #withQuorumHint(int) active} replica
     * to participate in asynchronous replication of state. Backup replicas allow quorum-member replicas to be
     * more quickly replaced in the event of a failure or an active replica leaving the cluster. Additionally,
     * backup replicas may service {@link io.atomix.resource.ReadConsistency#SEQUENTIAL SEQUENTIAL} and
     * {@link io.atomix.resource.ReadConsistency#LOCAL LOCAL} reads to allow read operations to be further
     * spread across the cluster.
     * <p>
     * The backup count is used to calculate the number of backup replicas per non-leader active member. The
     * number of actual backups is calculated by {@code (quorumHint - 1) * backupCount}. If the
     * {@code backupCount} is {@code 1} and the {@code quorumHint} is {@code 3}, the number of backup replicas
     * will be {@code 2}.
     * <p>
     * By default, the backup count is {@code 0}, indicating no backups should be maintained. However, it is
     * recommended that each cluster have at least a backup count of {@code 1} to ensure active replicas can
     * be quickly replaced in the event of a network partition or other failure. Quick replacement of active
     * member nodes improves fault tolerance in cases where a majority of the active members in the cluster are
     * not lost simultaneously.
     *
     * @param backupCount The number of backup replicas per active replica. This must be the same on all
     *                    replicas in the cluster.
     * @return The replica builder.
     * @throws IllegalArgumentException if the {@code backupCount} is negative
     */
    public Builder withBackupCount(int backupCount) {
      this.backupCount = Assert.argNot(backupCount, backupCount < 0, "backupCount must be positive");
      return this;
    }

    @Override
    public ClusterManager build() {
      return new BalancingClusterManager(quorumHint, backupCount);
    }
  }

}