package io.scalecube.cluster.membership;
import static io.scalecube.cluster.membership.MemberStatus.ALIVE;
import static io.scalecube.cluster.membership.MemberStatus.DEAD;
import io.scalecube.cluster.ClusterMath;
import io.scalecube.cluster.Member;
import io.scalecube.cluster.fdetector.FailureDetectorEvent;
import io.scalecube.cluster.fdetector.IFailureDetector;
import io.scalecube.cluster.gossip.IGossipProtocol;
import io.scalecube.transport.Address;
import io.scalecube.transport.Transport;
import io.scalecube.transport.Message;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import rx.Observable;
import rx.Scheduler;
import rx.Subscriber;
import rx.observers.Subscribers;
import rx.schedulers.Schedulers;
import rx.subjects.PublishSubject;
import rx.subjects.Subject;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.atomic.AtomicReference;
/**
 * Cluster membership protocol component. It keeps a table of {@link MembershipRecord}s keyed by member id, exchanges
 * SYNC / SYNC_ACK messages over the transport, reacts to failure detector events and membership gossips, and
 * publishes {@link MembershipEvent}s to subscribers of {@link #listen()}.
 */
public final class MembershipProtocol implements IMembershipProtocol {
private static final Logger LOGGER = LoggerFactory.getLogger(MembershipProtocol.class);
/**
 * Origin of a membership table update. It is passed to {@code updateMembership}, which skips re-gossiping the record
 * for {@link #MEMBERSHIP_GOSSIP} and {@link #INITIAL_SYNC}.
 */
private enum MembershipUpdateReason {
/** Status change reported by the failure detector. */
FAILURE_DETECTOR_EVENT,
/** Record received through a membership gossip. */
MEMBERSHIP_GOSSIP,
/** Record received in a SYNC request or in a non-initial SYNC_ACK response. */
SYNC,
/** Record received in the SYNC_ACK that answers the initial sync. */
INITIAL_SYNC,
/** Member declared DEAD because its suspicion timeout task fired. */
SUSPICION_TIMEOUT
}
// Qualifiers of the messages handled by this protocol
public static final String SYNC = "sc/membership/sync";
public static final String SYNC_ACK = "sc/membership/syncAck";
public static final String MEMBERSHIP_GOSSIP = "sc/membership/gossip";
// Injected
// Reference to the local member; replaced with a new Member instance whenever metadata is updated
private final AtomicReference<Member> memberRef;
private final Transport transport;
private final MembershipConfig config;
// Seed addresses from the config with duplicates and the local address removed
private final List<Address> seedMembers;
// Provided through setters; both are dereferenced by start()
private IFailureDetector failureDetector;
private IGossipProtocol gossipProtocol;
// State
// Membership records keyed by member id
private final Map<String, MembershipRecord> membershipTable = new HashMap<>();
// Subject
// Serialized subject through which membership events are published to listen() subscribers
private final Subject<MembershipEvent, MembershipEvent> subject =
PublishSubject.<MembershipEvent>create().toSerialized();
// Subscriptions (created in start(), unsubscribed in stop())
private Subscriber<Message> onSyncRequestSubscriber;
private Subscriber<Message> onSyncAckResponseSubscriber;
private Subscriber<FailureDetectorEvent> onFdEventSubscriber;
private Subscriber<Message> onGossipRequestSubscriber;
// Scheduled
// Rx scheduler backed by the executor below
private final Scheduler scheduler;
// Single-thread scheduled executor created in the constructor
private final ScheduledExecutorService executor;
// Pending suspicion timeout tasks keyed by member id
private final Map<String, ScheduledFuture<?>> suspicionTimeoutTasks = new HashMap<>();
// Handle of the periodic sync task; assigned by schedulePeriodicSync()
private ScheduledFuture<?> syncTask;
/**
 * Creates a new instance of the cluster membership protocol with the given transport and config.
 *
 * <p>The local member is created with a generated id, the transport address and the configured metadata. All
 * protocol work is scheduled on a dedicated single daemon thread named after the transport port.
 *
 * @param transport transport
 * @param config membership config parameters
 */
public MembershipProtocol(Transport transport, MembershipConfig config) {
  this.transport = transport;
  this.config = config;

  // Local member
  Member localMember = new Member(IdGenerator.generateId(), transport.address(), config.getMetadata());
  this.memberRef = new AtomicReference<>(localMember);

  // Dedicated single-threaded executor and the Rx scheduler on top of it
  String threadName = "sc-membership-" + transport.address().port();
  ThreadFactoryBuilder threadFactoryBuilder = new ThreadFactoryBuilder()
      .setNameFormat(threadName)
      .setDaemon(true);
  this.executor = Executors.newSingleThreadScheduledExecutor(threadFactoryBuilder.build());
  this.scheduler = Schedulers.from(this.executor);

  this.seedMembers = cleanUpSeedMembers(config.getSeedMembers());
}
/**
 * Returns an unmodifiable list of the given seed addresses with duplicates and the local transport address removed.
 */
private List<Address> cleanUpSeedMembers(Collection<Address> seedMembers) {
  // A set drops duplicate addresses
  Set<Address> uniqueSeeds = new HashSet<>(seedMembers);
  // The local node must not sync with itself
  uniqueSeeds.remove(transport.address());
  List<Address> cleaned = new ArrayList<>(uniqueSeeds);
  return Collections.unmodifiableList(cleaned);
}
/**
 * Sets the failure detector whose events this protocol subscribes to in {@link #start()}.
 *
 * @param failureDetector failure detector
 */
public void setFailureDetector(IFailureDetector failureDetector) {
this.failureDetector = failureDetector;
}
/**
 * Sets the gossip protocol used to receive and spread membership gossips.
 *
 * @param gossipProtocol gossip protocol
 */
public void setGossipProtocol(IGossipProtocol gossipProtocol) {
this.gossipProtocol = gossipProtocol;
}
/**
 * Returns the failure detector set on this protocol (may be null if none was set).
 * <b>NOTE:</b> this method is for testing purposes only.
 */
IFailureDetector getFailureDetector() {
return failureDetector;
}
/**
 * Returns the gossip protocol set on this protocol (may be null if none was set).
 * <b>NOTE:</b> this method is for testing purposes only.
 */
IGossipProtocol getGossipProtocol() {
return gossipProtocol;
}
/**
 * Returns the transport this protocol was created with.
 * <b>NOTE:</b> this method is for testing purposes only.
 */
Transport getTransport() {
return transport;
}
/**
 * Returns an immutable snapshot of the records currently held in the membership table.
 * <b>NOTE:</b> this method is for testing purposes only.
 */
List<MembershipRecord> getMembershipRecords() {
  Collection<MembershipRecord> currentRecords = membershipTable.values();
  return ImmutableList.copyOf(currentRecords);
}
/**
 * Returns the stream of membership events published by this protocol, with a backpressure buffer applied.
 */
@Override
public Observable<MembershipEvent> listen() {
  Observable<MembershipEvent> bufferedEvents = subject.onBackpressureBuffer();
  return bufferedEvents.asObservable();
}
/**
 * Returns the current local member as held by the member reference.
 */
@Override
public Member member() {
return memberRef.get();
}
/**
 * Replaces the metadata of the local member. The update is performed asynchronously on the protocol executor.
 *
 * @param metadata new metadata of the local member
 */
@Override
public void updateMetadata(Map<String, String> metadata) {
  Runnable updateTask = () -> onUpdateMetadata(metadata);
  executor.execute(updateTask);
}
/**
 * Sets a single metadata property of the local member. The update is performed asynchronously on the protocol
 * executor.
 *
 * @param key metadata key
 * @param value metadata value
 */
@Override
public void updateMetadataProperty(String key, String value) {
  Runnable updateTask = () -> onUpdateMetadataProperty(key, value);
  executor.execute(updateTask);
}
/**
 * Starts running the cluster membership protocol. Once started it begins to receive and send cluster membership
 * messages.
 *
 * @return future completed when the initial sync with the seed members has finished (SYNC_ACK received, timed out or
 *         failed), or an already completed future when there are no seed members
 * @throws IllegalStateException if the failure detector or the gossip protocol has not been set
 */
public CompletableFuture<Void> start() {
  // Fail fast with a clear message before anything is subscribed; otherwise a missing dependency would surface as a
  // bare NullPointerException after the transport subscriptions below were already active.
  Preconditions.checkState(failureDetector != null, "Failure detector must be set before start");
  Preconditions.checkState(gossipProtocol != null, "Gossip protocol must be set before start");

  // Init membership table with local member record
  Member member = memberRef.get();
  MembershipRecord localMemberRecord = new MembershipRecord(member, ALIVE, 0);
  membershipTable.put(member.id(), localMemberRecord);

  // Listen to incoming SYNC requests from other members
  onSyncRequestSubscriber = Subscribers.create(this::onSync, this::onError);
  transport.listen().observeOn(scheduler)
      .filter(msg -> SYNC.equals(msg.qualifier()))
      .filter(this::checkSyncGroup)
      .subscribe(onSyncRequestSubscriber);

  // Listen to incoming SYNC ACK responses from other members
  onSyncAckResponseSubscriber = Subscribers.create(this::onSyncAck, this::onError);
  transport.listen().observeOn(scheduler)
      .filter(msg -> SYNC_ACK.equals(msg.qualifier()))
      .filter(msg -> msg.correlationId() == null) // filter out initial sync
      .filter(this::checkSyncGroup)
      .subscribe(onSyncAckResponseSubscriber);

  // Listen to events from failure detector
  onFdEventSubscriber = Subscribers.create(this::onFailureDetectorEvent, this::onError);
  failureDetector.listen().observeOn(scheduler)
      .subscribe(onFdEventSubscriber);

  // Listen to membership gossips
  onGossipRequestSubscriber = Subscribers.create(this::onMembershipGossip, this::onError);
  gossipProtocol.listen().observeOn(scheduler)
      .filter(msg -> MEMBERSHIP_GOSSIP.equals(msg.qualifier()))
      .subscribe(onGossipRequestSubscriber);

  // Make initial sync with all seed members
  return doInitialSync();
}
/**
 * Error callback shared by the subscribers created in {@code start()}; logs the received error.
 */
private void onError(Throwable throwable) {
LOGGER.error("Received unexpected error: ", throwable);
}
/**
 * Stops running the cluster membership protocol and releases occupied resources: unsubscribes the message and event
 * subscribers, cancels the periodic sync task and all pending suspicion timeout tasks, shuts down the executor and
 * completes the membership event stream.
 */
public void stop() {
  // Stop accepting requests and events
  if (onSyncRequestSubscriber != null) {
    onSyncRequestSubscriber.unsubscribe();
  }
  if (onFdEventSubscriber != null) {
    onFdEventSubscriber.unsubscribe();
  }
  if (onGossipRequestSubscriber != null) {
    onGossipRequestSubscriber.unsubscribe();
  }
  if (onSyncAckResponseSubscriber != null) {
    onSyncAckResponseSubscriber.unsubscribe();
  }

  // Stop sending sync
  if (syncTask != null) {
    syncTask.cancel(true);
  }

  // Cancel remove members tasks (iterate the values directly instead of a key lookup per entry)
  for (ScheduledFuture<?> future : suspicionTimeoutTasks.values()) {
    if (future != null) {
      future.cancel(true);
    }
  }
  suspicionTimeoutTasks.clear();

  // Shutdown executor
  executor.shutdown();

  // Stop publishing events
  subject.onCompleted();
}
// ================================================
// ============== Action Methods ==================
// ================================================
/**
 * Sends SYNC to all seed members and waits for the first SYNC_ACK correlated with the local member id.
 *
 * <p>Periodic sync is scheduled in every outcome: no seed members, SYNC_ACK received, timeout or any other error.
 *
 * @return future completed with {@code null} once the initial sync attempt has finished; it never completes
 *         exceptionally
 */
private CompletableFuture<Void> doInitialSync() {
  LOGGER.debug("Making initial Sync to all seed members: {}", seedMembers);
  if (seedMembers.isEmpty()) {
    schedulePeriodicSync();
    return CompletableFuture.completedFuture(null);
  }

  CompletableFuture<Void> syncResponseFuture = new CompletableFuture<>();

  // Listen initial Sync Ack; the local member id is used as the correlation id of the initial SYNC
  String cid = memberRef.get().id();
  transport.listen().observeOn(scheduler)
      .filter(msg -> SYNC_ACK.equals(msg.qualifier()))
      .filter(msg -> cid.equals(msg.correlationId()))
      .filter(this::checkSyncGroup)
      .take(1)
      .timeout(config.getSyncTimeout(), TimeUnit.MILLISECONDS, scheduler)
      .subscribe(
          message -> {
            SyncData syncData = message.data();
            LOGGER.info("Joined cluster '{}': {}", syncData.getSyncGroup(), syncData.getMembership());
            onSyncAck(message, true);
            schedulePeriodicSync();
            syncResponseFuture.complete(null);
          },
          throwable -> {
            // Only a real timeout is reported as such; any other failure is logged together with its cause
            // instead of being mislabelled as a timeout and dropped.
            if (throwable instanceof java.util.concurrent.TimeoutException) {
              LOGGER.info("Timeout getting initial SyncAck from seed members: {}", seedMembers);
            } else {
              LOGGER.error("Failed getting initial SyncAck from seed members: {}", seedMembers, throwable);
            }
            schedulePeriodicSync();
            syncResponseFuture.complete(null);
          });

  Message syncMsg = prepareSyncDataMsg(SYNC, cid);
  seedMembers.forEach(address -> transport.send(address, syncMsg));

  return syncResponseFuture;
}
/**
 * Periodic sync step: sends the local membership table in a SYNC message to a selected sync address, if any.
 * Exceptions are caught and logged here.
 */
private void doSync() {
  try {
    Address target = selectSyncAddress();
    if (target != null) {
      Message syncMsg = prepareSyncDataMsg(SYNC, null);
      transport.send(target, syncMsg);
      LOGGER.debug("Send Sync to {}: {}", target, syncMsg);
    }
  } catch (Exception cause) {
    LOGGER.error("Unhandled exception: {}", cause, cause);
  }
}
// ================================================
// ============== Event Listeners =================
// ================================================
/**
 * Applies a single metadata property change on top of a copy of the current local metadata.
 */
private void onUpdateMetadataProperty(String key, String value) {
  // Copy the current metadata so the existing member instance is left untouched
  Map<String, String> updatedMetadata = new HashMap<>(memberRef.get().metadata());
  updatedMetadata.put(key, value);
  onUpdateMetadata(updatedMetadata);
}
/**
 * Replaces the local member with a copy carrying the given metadata, stores an ALIVE record with an incremented
 * incarnation, emits an "updated" membership event and gossips the new record.
 */
private void onUpdateMetadata(Map<String, String> metadata) {
  // Swap the local member reference
  Member previousMember = memberRef.get();
  String localId = previousMember.id();
  Member updatedMember = new Member(localId, previousMember.address(), metadata);
  memberRef.set(updatedMember);

  // Bump incarnation of the local record in the membership table
  int nextIncarnation = membershipTable.get(localId).incarnation() + 1;
  MembershipRecord updatedRecord = new MembershipRecord(updatedMember, ALIVE, nextIncarnation);
  membershipTable.put(localId, updatedRecord);

  // Notify local listeners, then the rest of the cluster
  subject.onNext(MembershipEvent.createUpdated(previousMember, updatedMember));
  spreadMembershipGossip(updatedRecord);
}
/**
 * Handles a non-initial SYNC_ACK by delegating to {@code onSyncAck(syncAckMsg, false)}.
 */
private void onSyncAck(Message syncAckMsg) {
onSyncAck(syncAckMsg, false);
}
/**
 * Merges the membership data carried by a SYNC_ACK message into the local membership table.
 *
 * @param syncAckMsg received SYNC_ACK message
 * @param initial whether this SYNC_ACK answers the initial sync
 */
private void onSyncAck(Message syncAckMsg, boolean initial) {
  LOGGER.debug("Received SyncAck: {}", syncAckMsg);
  SyncData receivedData = syncAckMsg.data();
  syncMembership(receivedData, initial);
}
/**
 * Merges incoming SYNC data into the local membership table and replies to the sender with a SYNC_ACK that carries
 * the merged table and the correlation id of the request.
 */
private void onSync(Message syncMsg) {
  LOGGER.debug("Received Sync: {}", syncMsg);

  SyncData receivedData = syncMsg.data();
  syncMembership(receivedData, false);

  // The reply is built after the merge so that it reflects the merged table
  Message reply = prepareSyncDataMsg(SYNC_ACK, syncMsg.correlationId());
  transport.send(syncMsg.sender(), reply);
}
/**
 * Processes a failure detector event. Events for unknown members or with an unchanged status are ignored. An ALIVE
 * event triggers an extra SYNC to the member; any other status is applied to the membership table with the current
 * incarnation.
 */
private void onFailureDetectorEvent(FailureDetectorEvent fdEvent) {
  MembershipRecord current = membershipTable.get(fdEvent.member().id());
  // Nothing to do when the member is already removed or its status has not changed
  if (current == null || current.status() == fdEvent.status()) {
    return;
  }

  LOGGER.debug("Received status change on failure detector event: {}", fdEvent);

  if (fdEvent.status() != ALIVE) {
    MembershipRecord changed = new MembershipRecord(current.member(), fdEvent.status(), current.incarnation());
    updateMembership(changed, MembershipUpdateReason.FAILURE_DETECTOR_EVENT);
    return;
  }

  // TODO: Consider to make more elegant solution
  // Alive won't override SUSPECT so issue instead extra sync with member to force it spread alive with inc + 1
  Message syncMsg = prepareSyncDataMsg(SYNC, null);
  transport.send(fdEvent.member().address(), syncMsg);
}
/**
 * Merges a membership record received via gossip into the membership table (the record is not gossiped further).
 */
private void onMembershipGossip(Message message) {
  MembershipRecord gossiped = message.data();
  LOGGER.debug("Received membership gossip: {}", gossiped);
  updateMembership(gossiped, MembershipUpdateReason.MEMBERSHIP_GOSSIP);
}
// ================================================
// ============== Helper Methods ==================
// ================================================
/**
 * Picks a random seed member address to sync with.
 *
 * @return randomly selected seed address, or {@code null} when there are no seed members
 */
private Address selectSyncAddress() {
  // TODO [AK]: During running phase it should send to both seed or not seed members (issue #38)
  if (seedMembers.isEmpty()) {
    return null;
  }
  int randomIndex = ThreadLocalRandom.current().nextInt(seedMembers.size());
  return seedMembers.get(randomIndex);
}
/**
 * Checks whether the message carries sync data belonging to the configured sync group.
 *
 * <p>Messages whose payload is missing or is not a {@link SyncData} are rejected rather than causing an exception:
 * this predicate runs inside Rx {@code filter} operators, where a thrown exception would terminate the subscription
 * and stop all further processing of sync messages.
 *
 * @param message incoming SYNC or SYNC_ACK message
 * @return {@code true} if the message contains sync data of this node's sync group
 */
private boolean checkSyncGroup(Message message) {
  Object payload = message.data();
  if (!(payload instanceof SyncData)) {
    LOGGER.debug("Ignoring message without sync data: {}", message);
    return false;
  }
  SyncData data = (SyncData) payload;
  return config.getSyncGroup().equals(data.getSyncGroup());
}
/**
 * Schedules {@code doSync} on the protocol executor with a fixed delay equal to the configured sync interval
 * (in milliseconds), which is also used as the initial delay.
 */
private void schedulePeriodicSync() {
  int intervalMillis = config.getSyncInterval();
  Runnable syncStep = this::doSync;
  syncTask = executor.scheduleWithFixedDelay(syncStep, intervalMillis, intervalMillis, TimeUnit.MILLISECONDS);
}
/**
 * Builds a message carrying a snapshot of the membership table and the configured sync group.
 *
 * @param qualifier message qualifier (SYNC or SYNC_ACK)
 * @param cid correlation id to set on the message, may be {@code null}
 * @return message ready to be sent
 */
private Message prepareSyncDataMsg(String qualifier, String cid) {
  // Copy the records so the message does not reference the live table view
  List<MembershipRecord> snapshot = new ArrayList<>(membershipTable.values());
  SyncData payload = new SyncData(snapshot, config.getSyncGroup());
  return Message.withData(payload)
      .qualifier(qualifier)
      .correlationId(cid)
      .build();
}
/**
 * Merges received sync data into the membership table: every received record that differs from the locally stored
 * one is passed to {@code updateMembership}.
 *
 * @param syncData received sync data
 * @param initial whether the data comes from the initial sync
 */
private void syncMembership(SyncData syncData, boolean initial) {
  MembershipUpdateReason reason;
  if (initial) {
    reason = MembershipUpdateReason.INITIAL_SYNC;
  } else {
    reason = MembershipUpdateReason.SYNC;
  }

  for (MembershipRecord incoming : syncData.getMembership()) {
    MembershipRecord existing = membershipTable.get(incoming.id());
    if (incoming.equals(existing)) {
      continue; // identical record, nothing to merge
    }
    updateMembership(incoming, reason);
  }
}
/**
 * Tries to update the membership table with the given record.
 *
 * <p>The record is ignored unless {@code r1.isOverrides(r0)} holds for the currently stored record r0. A record about
 * the local member is never applied as is: the local record is replaced by an ALIVE record with an incremented
 * incarnation, which is then gossiped. For other members the table is updated (DEAD records are removed), the
 * suspicion timeout task is scheduled or cancelled, a membership event is emitted when applicable, and the record is
 * gossiped unless the reason is MEMBERSHIP_GOSSIP or INITIAL_SYNC.
 *
 * @param r1 new membership record which is compared with the existing r0 record; must not be null
 * @param reason indicating the reason for updating the membership table
 * @throws IllegalArgumentException if r1 is null
 */
private void updateMembership(MembershipRecord r1, MembershipUpdateReason reason) {
Preconditions.checkArgument(r1 != null, "Membership record can't be null");
// Get current record (null when the member is not in the table)
MembershipRecord r0 = membershipTable.get(r1.id());
// Check if new record r1 overrides existing membership record r0
if (!r1.isOverrides(r0)) {
return;
}
// If an update was received for the local member then increase incarnation number and spread Alive gossip
Member localMember = memberRef.get();
if (r1.member().id().equals(localMember.id())) {
// New incarnation must exceed both the stored and the received one
int currentIncarnation = Math.max(r0.incarnation(), r1.incarnation());
MembershipRecord r2 = new MembershipRecord(localMember, ALIVE, currentIncarnation + 1);
membershipTable.put(localMember.id(), r2);
LOGGER.debug("Local membership record r0={}, but received r1={}, spread r2={}", r0, r1, r2);
spreadMembershipGossip(r2);
return;
}
// Update membership: dead members are removed from the table, any other status is stored
if (r1.isDead()) {
membershipTable.remove(r1.id());
} else {
membershipTable.put(r1.id(), r1);
}
// Schedule/cancel suspicion timeout task
if (r1.isSuspect()) {
scheduleSuspicionTimeoutTask(r1);
} else {
cancelSuspicionTimeoutTask(r1.id());
}
// Emit membership event: removed for DEAD, added for a new ALIVE member, updated when the member itself changed
if (r1.isDead()) {
subject.onNext(MembershipEvent.createRemoved(r1.member()));
} else if (r0 == null && r1.isAlive()) {
subject.onNext(MembershipEvent.createAdded(r1.member()));
} else if (r0 != null && !r0.member().equals(r1.member())) {
subject.onNext(MembershipEvent.createUpdated(r0.member(), r1.member()));
}
// Spread gossip (unless already gossiped)
if (reason != MembershipUpdateReason.MEMBERSHIP_GOSSIP && reason != MembershipUpdateReason.INITIAL_SYNC) {
spreadMembershipGossip(r1);
}
}
/**
 * Removes and cancels the pending suspicion timeout task of the given member, if there is one.
 */
private void cancelSuspicionTimeoutTask(String memberId) {
  ScheduledFuture<?> pendingTask = suspicionTimeoutTasks.remove(memberId);
  if (pendingTask == null) {
    return;
  }
  pendingTask.cancel(true);
}
/**
 * Schedules a suspicion timeout task for the member of the given record unless one is already pending. The timeout
 * is computed from the configured suspicion multiplier, the current membership table size and the ping interval.
 */
private void scheduleSuspicionTimeoutTask(MembershipRecord record) {
  int clusterSize = membershipTable.size();
  long timeoutMillis =
      ClusterMath.suspicionTimeout(config.getSuspicionMult(), clusterSize, config.getPingInterval());

  // computeIfAbsent keeps an already scheduled task for this member untouched
  suspicionTimeoutTasks.computeIfAbsent(
      record.id(),
      memberId -> executor.schedule(() -> onSuspicionTimeout(memberId), timeoutMillis, TimeUnit.MILLISECONDS));
}
/**
 * Fired when the suspicion timeout of a member elapses: forgets the task and, if the member is still in the
 * membership table, declares it DEAD with its current incarnation.
 */
private void onSuspicionTimeout(String memberId) {
  suspicionTimeoutTasks.remove(memberId);

  MembershipRecord suspected = membershipTable.get(memberId);
  if (suspected == null) {
    return; // member is no longer in the table
  }

  LOGGER.debug("Declare SUSPECTED member as DEAD by timeout: {}", suspected);
  MembershipRecord deadRecord = new MembershipRecord(suspected.member(), DEAD, suspected.incarnation());
  updateMembership(deadRecord, MembershipUpdateReason.SUSPICION_TIMEOUT);
}
/**
 * Wraps the given record into a message with the MEMBERSHIP_GOSSIP qualifier and hands it to the gossip protocol
 * for spreading.
 */
private void spreadMembershipGossip(MembershipRecord record) {
Message membershipMsg = Message.withData(record).qualifier(MEMBERSHIP_GOSSIP).build();
gossipProtocol.spread(membershipMsg);
}
}