/*
* Copyright (c) 2015 Spotify AB.
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.spotify.heroic.cluster;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.spotify.heroic.HeroicConfiguration;
import com.spotify.heroic.HeroicContext;
import com.spotify.heroic.lifecycle.LifeCycleRegistry;
import com.spotify.heroic.lifecycle.LifeCycles;
import com.spotify.heroic.metric.QueryTrace;
import com.spotify.heroic.scheduler.Scheduler;
import com.spotify.heroic.statistics.QueryReporter;
import eu.toolchain.async.AsyncFramework;
import eu.toolchain.async.AsyncFuture;
import eu.toolchain.async.LazyTransform;
import eu.toolchain.async.Transform;
import java.net.URI;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.CancellationException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Consumer;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import javax.inject.Inject;
import javax.inject.Named;
import lombok.Data;
import lombok.ToString;
import lombok.extern.slf4j.Slf4j;
/**
* Handles management of cluster state.
* <p>
* The primary responsibility is to receive refresh requests through {@link #refresh()} that should
* cause the cluster state to be updated.
*
* @author udoprog
*/
@ClusterScope
@Slf4j
@ToString(of = {"useLocal"})
public class CoreClusterManager implements ClusterManager, LifeCycles {
/** Trace identifier given to the local node when a refresh falls back to it. */
public static final QueryTrace.Identifier LOCAL_IDENTIFIER =
new QueryTrace.Identifier("[local]");
/** Framework used to create and combine futures. */
private final AsyncFramework async;
/** Source of dynamically discovered node URIs, queried on every refresh. */
private final ClusterDiscovery discovery;
/** Metadata of this process; its id is compared against connected nodes to spot ourselves. */
private final NodeMetadata localMetadata;
/** Available RPC protocols, looked up by URI scheme when connecting to a node. */
private final Map<String, RpcProtocol> protocols;
/** Scheduler that runs the periodic "cluster-refresh" task. */
private final Scheduler scheduler;
/** Whether the in-process node may stand in for a remote connection to ourselves. */
private final Boolean useLocal;
/** Configuration; within this class only {@code isOneshot()} is consulted, in {@link #start()}. */
private final HeroicConfiguration options;
/** The in-process node. */
private final LocalClusterNode local;
/** Context providing the future that {@link #start()} chains on. */
private final HeroicContext context;
/** Shards that {@link #allShards()} always reports, even when no node was discovered for them. */
private final Set<Map<String, String>> expectedTopology;
/** Reporter handed to every {@link ClusterShard} created by this manager. */
private final QueryReporter reporter;
/** Statically registered node URIs; replaced as a whole through compareAndSet on add/remove. */
final AtomicReference<Set<URI>> staticNodes = new AtomicReference<>(new HashSet<>());
/** Registry installed by the most recent finalized refresh; {@code null} until then. */
final AtomicReference<NodeRegistry> registry = new AtomicReference<>();
/** Connected nodes by URI as of the last finalized refresh; set to {@code null} by stop(). */
final AtomicReference<Map<URI, ClusterNode>> clients =
new AtomicReference<>(Collections.emptyMap());
/** Counter used to derive the id that labels each refresh in the log output. */
final AtomicLong refreshId = new AtomicLong();
/** Held while the registry is replaced and while the shard lookups read it. */
final Object updateRegistryLock = new Object();
/**
 * Create a cluster manager from its injected collaborators.
 *
 * @param async framework used to create and combine futures
 * @param discovery source of dynamically discovered node URIs
 * @param localMetadata metadata describing this process
 * @param protocols RPC protocols keyed by URI scheme
 * @param scheduler scheduler for the periodic refresh task
 * @param useLocal whether the in-process node may be used in place of a remote connection
 * @param options configuration, consulted for one-shot mode
 * @param local the in-process node
 * @param context context providing the started future
 * @param expectedTopology shards that are always reported as part of the topology
 * @param reporter reporter passed on to created shards
 */
@Inject
public CoreClusterManager(
    final AsyncFramework async,
    final ClusterDiscovery discovery,
    final NodeMetadata localMetadata,
    final Map<String, RpcProtocol> protocols,
    final Scheduler scheduler,
    @Named("useLocal") final Boolean useLocal,
    final HeroicConfiguration options,
    final LocalClusterNode local,
    final HeroicContext context,
    @Named("topology") final Set<Map<String, String>> expectedTopology,
    final QueryReporter reporter
) {
    // infrastructure
    this.async = async;
    this.scheduler = scheduler;
    this.context = context;
    this.options = options;
    this.reporter = reporter;

    // cluster membership
    this.discovery = discovery;
    this.protocols = protocols;
    this.expectedTopology = expectedTopology;

    // local node handling
    this.useLocal = useLocal;
    this.local = local;
    this.localMetadata = localMetadata;
}
/**
 * Hook this manager's start and stop procedures into the given life cycle registry.
 *
 * @param registry registry to attach the hooks to
 */
@Override
public void register(LifeCycleRegistry registry) {
    registry.start(() -> this.start());
    registry.stop(() -> this.stop());
}
/**
 * Get the currently registered static nodes.
 *
 * @return an already resolved future holding the current set of static node URIs
 */
@Override
public AsyncFuture<Set<URI>> getStaticNodes() {
    final Set<URI> snapshot = staticNodes.get();
    return async.resolved(snapshot);
}
/**
 * Remove a statically registered node and refresh the cluster state.
 * <p>
 * The set of static nodes is never modified in place; a modified copy of the observed snapshot
 * is swapped in with compareAndSet, and the attempt is repeated if another thread replaced the
 * set in the meantime.
 *
 * @param node URI of the node to remove
 * @return a resolved future if the node was not registered, otherwise the future of the
 * refresh triggered by the removal
 */
@Override
public AsyncFuture<Void> removeStaticNode(URI node) {
    Set<URI> current;
    Set<URI> update;

    do {
        current = staticNodes.get();

        /* node is not registered, nothing to remove */
        if (!current.contains(node)) {
            return async.resolved();
        }

        /* derive the update from the very snapshot we compare against */
        update = new HashSet<>(current);
        update.remove(node);
    } while (!staticNodes.compareAndSet(current, update));

    return refresh();
}
/**
 * Register a static node and refresh the cluster state.
 * <p>
 * The set of static nodes is never modified in place; a modified copy of the observed snapshot
 * is swapped in with compareAndSet, and the attempt is repeated if another thread replaced the
 * set in the meantime.
 *
 * @param node URI of the node to add
 * @return a resolved future if the node was already registered, otherwise the future of the
 * refresh triggered by the addition
 */
@Override
public AsyncFuture<Void> addStaticNode(URI node) {
    Set<URI> current;
    Set<URI> update;

    do {
        current = staticNodes.get();

        /* node already registered */
        if (current.contains(node)) {
            return async.resolved();
        }

        /* derive the update from the very snapshot we compare against */
        update = new HashSet<>(current);
        update.add(node);
    } while (!staticNodes.compareAndSet(current, update));

    return refresh();
}
/**
 * Eventually consistent view of the currently known nodes in the cluster.
 *
 * @return the entries of the current registry
 * @throws IllegalStateException if no registry has been installed yet
 */
@Override
public List<ClusterNode> getNodes() {
    return Optional
        .ofNullable(this.registry.get())
        .orElseThrow(() -> new IllegalStateException("Registry not ready"))
        .getEntries();
}
/**
 * Perform a refresh of the cluster information.
 * <p>
 * A refresh happens in four steps.
 * <ol>
 * <li>discovery</li>
 * <li>sweep</li>
 * <li>log and prepare</li>
 * <li>finalize</li>
 * </ol>
 * </p>
 * <p>
 * The discovery step gathers the URIs that are provided statically (by {@link #staticNodes})
 * and dynamically (by {@link #discovery}) and feeds them into the sweep step.
 * </p>
 * <p>
 * The sweep step compares the existing {@link #clients} map against the gathered URIs.
 * </p>
 * <p>
 * The log and prepare step logs which operations happened and prepares the input for the
 * final step.
 * </p>
 * <p>
 * The finalize step atomically replaces the old collection of clients with the new one.
 * If the replacement loses a race, another refresh is issued.
 * If it succeeds, the new registry becomes active and the nodes that were removed or deemed
 * faulty have their connections closed.
 * </p>
 *
 * @return a future indicating the state of the refresh.
 */
@Override
public AsyncFuture<Void> refresh() {
    final long sequence = refreshId.getAndIncrement();
    /* zero-padded hex id that ties together all log lines of this refresh */
    final String id = String.format("%08x", sequence);

    log.info("new refresh with id ({})", id);
    return refreshDiscovery(id);
}
/**
 * Get the online/offline node counts of the current registry.
 *
 * @return statistics for the current registry, or {@code null} if no registry has been
 * installed yet
 */
@Override
public ClusterManager.Statistics getStatistics() {
    final NodeRegistry current = this.registry.get();

    return current == null
        ? null
        : new ClusterManager.Statistics(current.getOnlineNodes(), current.getOfflineNodes());
}
/**
 * Eventually consistent view of the currently known shards in the cluster.
 * <p>
 * One {@link ClusterShard} is created for every entry of {@link #allShards()}. The
 * {@code group} argument is not consulted by this implementation.
 *
 * @param group optional group name
 * @return an immutable list with one shard per known set of shard tags
 */
@Override
public List<ClusterShard> useOptionalGroup(final Optional<String> group) {
    final List<ClusterShard> created = allShards()
        .stream()
        .map(tags -> new ClusterShard(async, tags, reporter, this))
        .collect(Collectors.toList());

    return ImmutableList.copyOf(created);
}
/**
 * Pick a node in the given shard that is not excluded and apply a function to its default
 * group.
 * <p>
 * Both the lookup and the function call happen while holding {@link #updateRegistryLock}.
 *
 * @param shard tags identifying the shard
 * @param exclude predicate passed to the registry lookup to exclude nodes
 * @param fn function applied to the default group of the picked node
 * @return the function result paired with the picked node, or empty if the registry found no
 * node
 */
@Override
public <T> Optional<ClusterManager.NodeResult<T>> withNodeInShardButNotWithId(
    final Map<String, String> shard, final Predicate<ClusterNode> exclude,
    final Function<ClusterNode.Group, T> fn
) {
    synchronized (this.updateRegistryLock) {
        return registry
            .get()
            .getNodeInShardButNotWithId(shard, exclude)
            .map(picked -> new ClusterManager.NodeResult<T>(
                fn.apply(picked.useDefaultGroup()), picked));
    }
}
/**
 * Check whether the registry can provide a node in the given shard that is not excluded.
 *
 * @param shard tags identifying the shard
 * @param exclude predicate passed to the registry lookup to exclude nodes
 * @return {@code true} if the registry lookup yields a node
 */
@Override
public boolean hasNextButNotWithId(
    final Map<String, String> shard, final Predicate<ClusterNode> exclude
) {
    synchronized (this.updateRegistryLock) {
        return registry.get().getNodeInShardButNotWithId(shard, exclude).isPresent();
    }
}
/**
 * Eventually consistent view of the currently known nodes in a specific shard.
 *
 * @param shard tags identifying the shard
 * @return the nodes the current registry reports for the shard
 */
@Override
public List<ClusterNode> getNodesForShard(Map<String, String> shard) {
    final List<ClusterNode> members;

    synchronized (this.updateRegistryLock) {
        members = registry.get().getNodesInShard(shard);
    }

    return members;
}
/**
 * Get the RPC protocols known to this manager.
 *
 * @return an immutable set containing every configured protocol
 */
@Override
public Set<RpcProtocol> protocols() {
    final Collection<RpcProtocol> configured = protocols.values();
    return ImmutableSet.copyOf(configured);
}
/**
 * Start hook of the cluster manager.
 * <p>
 * Unless running in one-shot mode, a "cluster-refresh" task is registered with the scheduler
 * once the context's started future resolves. In both modes an initial refresh is chained
 * onto that future; its failure is logged and not propagated.
 *
 * @return an already resolved future; the returned future does not track the initial refresh
 */
AsyncFuture<Void> start() {
    final boolean oneshot = options.isOneshot();

    AsyncFuture<Void> startup = context.startedFuture();

    if (!oneshot) {
        startup = startup.directTransform(result -> {
            scheduler.periodically("cluster-refresh", 1, TimeUnit.MINUTES,
                () -> refresh().get());
            return null;
        });
    }

    /* chained but deliberately not returned, see the return documentation */
    startup.lazyTransform(result -> refresh().catchFailed((Throwable e) -> {
        log.error("initial metadata refresh failed", e);
        return null;
    }));

    return async.resolved();
}
/**
 * Stop hook of the cluster manager.
 * <p>
 * Swaps the client map for {@code null}, which makes an in-flight refresh abort in its sweep
 * step, and closes every node that was in the map.
 *
 * @return a future that completes when all close operations have completed, or a resolved
 * future if the client map had already been taken
 */
AsyncFuture<Void> stop() {
    final Map<URI, ClusterNode> active = this.clients.getAndSet(null);

    if (active == null) {
        return async.resolved();
    }

    final List<AsyncFuture<Void>> closing = new ArrayList<>();

    for (final ClusterNode node : active.values()) {
        closing.add(node.close());
    }

    return async.collectAndDiscard(closing);
}
/**
 * Get every shard that is either reported by the registry or part of the expected topology.
 * <p>
 * The actual topology is detected from the metadata coming from the nodes. The expected
 * topology is the optional 'topology' configuration and specifies the minimum set of shards;
 * additional shards may exist.
 *
 * @return the registry's own shard set if it already covers the expected topology, otherwise
 * a new set holding the union of both
 * @throws IllegalStateException if no registry has been installed yet
 */
Set<Map<String, String>> allShards() {
    final NodeRegistry current = this.registry.get();

    if (current == null) {
        throw new IllegalStateException("Registry not ready");
    }

    final Set<Map<String, String>> discovered = current.getShards();

    if (discovered.containsAll(expectedTopology)) {
        return discovered;
    }

    /* Expected shards without a discovered node are still listed, so that suitable code
     * later on will complain that there were shards with no available nodes/groups. */
    final Set<Map<String, String>> combined = new HashSet<>(discovered);
    combined.addAll(expectedTopology);
    return combined;
}
/**
 * Collect the distinct tag sets of the given nodes.
 *
 * @param entries nodes to inspect
 * @return a new set with the metadata tags of every node
 */
Set<Map<String, String>> extractKnownShards(Set<ClusterNode> entries) {
    return entries
        .stream()
        .<Map<String, String>>map(node -> node.metadata().getTags())
        .collect(Collectors.toCollection(HashSet::new));
}
/**
 * Connect to the node at the given URI and wrap the outcome in an {@link Update}.
 * <p>
 * The protocol is picked by URI scheme. If the connected node reports the same id as this
 * process and {@code useLocal} is set, the connection is closed again and the in-process node
 * is used in its place. Failure and cancellation of the returned chain are converted into
 * failed updates rather than propagated.
 *
 * @param id id of the refresh operation, used for logging
 * @param uri URI of the node to connect to
 * @return a future resolving to the update describing the outcome
 */
AsyncFuture<Update> createClusterNode(final String id, final URI uri) {
    final String scheme = uri.getScheme();
    final RpcProtocol protocol = protocols.get(scheme);

    if (protocol == null) {
        final Throwable unsupported =
            new IllegalArgumentException("Unsupported protocol (" + scheme + ")");
        return async.resolved(new FailedUpdate(uri, unsupported, Optional.empty()));
    }

    final AsyncFuture<Update> connected = protocol.connect(uri).<Update>lazyTransform(node -> {
        final boolean isSelf =
            useLocal && localMetadata.getId().equals(node.metadata().getId());

        if (!isSelf) {
            final QueryTrace.Identifier remoteIdentifier =
                new QueryTrace.Identifier(uri.toString());
            return async.resolved(
                new SuccessfulUpdate(uri, true, new TracingClusterNode(node, remoteIdentifier)));
        }

        log.info("{} using local instead of {} (closing old node)", id, node);

        final TracingClusterNode tracingNode = new TracingClusterNode(local,
            new QueryTrace.Identifier(uri.toString() + "[local]"));

        /* the remote connection is not needed, close it before reporting success */
        return node.close().directTransform(v -> new SuccessfulUpdate(uri, true, tracingNode));
    });

    return connected.catchFailed(Update.error(uri)).catchCancelled(Update.cancellation(uri));
}
/**
 * The first step of the refresh operation.
 * <p>
 * Gathers the statically registered URIs (when there are any) together with the result of
 * {@link #discovery}, and feeds them into the sweep step.
 *
 * @param id id of the operation
 * @return a future indicating when the operation is finished
 */
AsyncFuture<Void> refreshDiscovery(final String id) {
    final List<AsyncFuture<List<URI>>> sources = new ArrayList<>();

    /* copy, so that the sweep works on a stable list */
    final List<URI> configured = new ArrayList<>(this.staticNodes.get());

    if (!configured.isEmpty()) {
        sources.add(async.resolved(configured));
    }

    sources.add(discovery.find());

    return async.collect(sources).lazyTransform(refreshSweep(id));
}
/**
 * Operation that takes the existing map of clients, compares it to a collection of resolved
 * URIs and determines which nodes should be updated, and which should be removed.
 * <p>
 * URIs seen for the first time are connected to. For URIs that are already known the
 * metadata is fetched again; if it differs, the old node is scheduled for removal and a new
 * connection is made, otherwise the existing node is re-used. If fetching the metadata fails,
 * the resulting failed update carries the existing node so that the finalize step closes it
 * instead of leaking its connection. Known URIs that are absent from the resolved URIs are
 * scheduled for removal.
 *
 * @param id id of the operation
 * @return a lazy transform
 */
LazyTransform<Collection<List<URI>>, Void> refreshSweep(final String id) {
    return uriLists -> {
        final List<URI> uris = ImmutableList.copyOf(Iterables.concat(uriLists));

        final List<AsyncFuture<Update>> updated = new ArrayList<>();
        /* NOTE(review): also appended to from the fetchMetadata callbacks below; confirm
         * those callbacks cannot run concurrently with each other or with this method. */
        final List<RemovedNode> removedNodes = new ArrayList<>();

        final Map<URI, ClusterNode> oldClients = this.clients.get();

        /* stop() replaces the client map with null */
        if (oldClients == null) {
            log.warn("{}: Aborting refresh, shutting down", id);
            return async.resolved();
        }

        final Set<URI> removedUris = new HashSet<>(oldClients.keySet());

        for (final URI uri : uris) {
            final ClusterNode node = oldClients.get(uri);

            removedUris.remove(uri);

            if (node == null) {
                /* first time URI has been seen, resolve new node */
                updated.add(createClusterNode(id, uri));
                continue;
            }

            /* re-query metadata for nodes already known and make sure it matches.
             * if it does not match, create a new cluster node and close the old one.
             * otherwise, re-use the existing node.
             * on failure, hand over the existing node so that it gets closed.
             * NOTE(review): a cancelled fetch still yields an update without the existing
             * node, so that node is not closed - confirm whether this is intended. */
            updated.add(node.fetchMetadata().lazyTransform(m -> {
                if (!node.metadata().equals(m)) {
                    /* add to removedNodes list to make sure it is being closed */
                    removedNodes.add(new RemovedNode(uri, node));
                    return createClusterNode(id, uri);
                }

                return async.resolved(new SuccessfulUpdate(uri, false, node));
            }).catchFailed(Update.error(uri, node)).catchCancelled(Update.cancellation(uri)));
        }

        /* all the nodes that have not been seen in the updated list of uris have been removed
         * and should be closed */
        for (final URI uri : removedUris) {
            final ClusterNode remove = oldClients.get(uri);

            if (remove != null) {
                removedNodes.add(new RemovedNode(uri, remove));
            }
        }

        return async
            .collect(updated)
            .lazyTransform(refreshLogAndPrepare(id, removedNodes, oldClients));
    };
}
/**
 * Operation that logs all intended operations and prepares for the final step.
 * <p>
 * Successful updates populate the new client map and the set of nodes for the registry.
 * Failed updates are logged, and any existing node they carry is collected so that it can be
 * closed. If no node was successful and {@code useLocal} is set, the in-process node is used
 * as the only registry entry.
 *
 * @param id id of the refresh operation
 * @param removedNodes clients which should be removed
 * @param oldClients map of clients that should be replaced by a new map of clients
 * @return a lazy transform
 */
LazyTransform<Collection<Update>, Void> refreshLogAndPrepare(
    final String id, final List<RemovedNode> removedNodes,
    final Map<URI, ClusterNode> oldClients
) {
    return updates -> {
        final Map<URI, ClusterNode> newClients = new HashMap<>();
        final Set<ClusterNode> okNodes = new HashSet<>();
        final List<SuccessfulUpdate> okUpdates = new ArrayList<>();
        final List<ClusterNode> failedNodes = new ArrayList<>();

        for (final Update update : updates) {
            update.handle(ok -> {
                if (ok.isAdded()) {
                    log.info("{} [new] {}", id, ok.getUri());
                }

                newClients.put(ok.getUri(), ok.getNode());
                okNodes.add(ok.getNode());
                okUpdates.add(ok);
            }, failed -> {
                log.error("{} [failed] {}", id, failed.getUri(), failed.getError());
                failed.getExistingNode().ifPresent(failedNodes::add);
            });
        }

        final boolean fallBackToLocal = okNodes.isEmpty() && useLocal;

        if (fallBackToLocal) {
            log.info("{} [refresh] no nodes discovered, including local node", id);
            okNodes.add(new TracingClusterNode(local, LOCAL_IDENTIFIER));
        }

        final Set<Map<String, String>> knownShards = extractKnownShards(okNodes);
        log.info("{} [update] {} {} result(s)", id, knownShards, okNodes.size());

        /* swap in the new state and shut down removed and failed nodes */
        return refreshFinalize(id, oldClients, newClients, okNodes, okUpdates, removedNodes,
            failedNodes);
    };
}
/**
 * Update the local state of the registry, or attempt another refresh if the local state has
 * already been updated by someone else.
 * <p>
 * The client map is swapped with compareAndSet against the map this refresh started from.
 * On success a new registry is installed and the removed and failed nodes are closed. On
 * failure the nodes that were newly added by this refresh are closed, since they were not
 * part of the previous registry, and the refresh is started over with the same id.
 *
 * @param id id of the operation
 * @param oldClients map of clients that should be updated from
 * @param newClients map of clients that should be updated to
 * @param okNodes entries to add to registry
 * @param okUpdates list of successful updates
 * @param removedNodes list of nodes that should not be a part of cluster anymore
 * @param failedNodes list of nodes that failed and should be excluded until they are ok again
 * @return a future indicating when the finalization, including any retry, is finished
 */
AsyncFuture<Void> refreshFinalize(
    final String id, final Map<URI, ClusterNode> oldClients,
    final Map<URI, ClusterNode> newClients, final Set<ClusterNode> okNodes,
    final List<SuccessfulUpdate> okUpdates, final List<RemovedNode> removedNodes,
    final List<ClusterNode> failedNodes
) {
    final List<AsyncFuture<Void>> closing = new ArrayList<>();

    if (!this.clients.compareAndSet(oldClients, newClients)) {
        log.warn("{} another refresh in progress, trying again", id);

        /* Another refresh was already in progress (our refresh perhaps took unexpectedly
         * long). Clean up after the current refresh before retrying. */
        for (final SuccessfulUpdate update : okUpdates) {
            if (update.isAdded()) {
                closing.add(update.getNode().close());
            }
        }

        return async.collectAndDiscard(closing).lazyTransform(v0 -> refreshDiscovery(id));
    }

    final NodeRegistry replacement =
        new NodeRegistry(async, new ArrayList<>(okNodes), okNodes.size());

    synchronized (this.updateRegistryLock) {
        registry.set(replacement);
    }

    // Close removed nodes
    for (final RemovedNode removed : removedNodes) {
        log.error("{} [remove] {}", id, removed.getUri());
        closing.add(removed.getNode().close());
    }

    // Close failed nodes
    for (final ClusterNode failed : failedNodes) {
        closing.add(failed.close());
    }

    return async.collectAndDiscard(closing);
}
/**
* A container that contains information about a node update.
*/
interface Update {
static Transform<Throwable, Update> error(final URI uri) {
return e -> new FailedUpdate(uri, e, Optional.empty());
}
static Transform<Throwable, Update> error(final URI uri, final ClusterNode existingNode) {
return e -> new FailedUpdate(uri, e, Optional.of(existingNode));
}
static Transform<Void, Update> cancellation(final URI uri) {
return ignore -> new FailedUpdate(uri, new CancellationException(), Optional.empty());
}
/**
* Handle the current update.
*
* @param successful Consumer for an successful update, will be called if the update is
* successful
* @param error Consumer for a failed update, will be called if the update is failed.
*/
void handle(
final Consumer<SuccessfulUpdate> successful, final Consumer<FailedUpdate> error
);
}
/**
 * A successful node update.
 * <p>
 * Constructor and accessors are generated by Lombok from the fields below, in declaration
 * order.
 */
@Data
static class SuccessfulUpdate implements Update {
/**
 * The URI that was updated.
 */
private final URI uri;
/**
 * If the update is a new addition, i.e. the node was connected to during this refresh
 * rather than re-used from the previous client map.
 */
private final boolean added;
/**
 * The cluster node part of the update.
 */
private final ClusterNode node;
/**
 * Dispatch to the consumer for successful updates; the error consumer is never called.
 */
@Override
public void handle(
final Consumer<SuccessfulUpdate> successful, final Consumer<FailedUpdate> error
) {
successful.accept(this);
}
}
/**
 * A failed node update.
 * <p>
 * Constructor and accessors are generated by Lombok from the fields below, in declaration
 * order.
 */
@Data
static class FailedUpdate implements Update {
/**
 * URI of the node that failed to update.
 */
private final URI uri;
/**
 * The error that caused the failure.
 */
private final Throwable error;
/**
 * The existing node, to be closed. When present, the log and prepare step collects it so
 * that the finalize step can close it.
 */
private final Optional<ClusterNode> existingNode;
/**
 * Dispatch to the consumer for failed updates; the success consumer is never called.
 */
@Override
public void handle(
final Consumer<SuccessfulUpdate> successful, final Consumer<FailedUpdate> error
) {
error.accept(this);
}
}
/**
 * A single removed node, i.e. a node that the finalize step closes once the new client map
 * has been installed.
 * <p>
 * Constructor and accessors are generated by Lombok from the fields below, in declaration
 * order.
 */
@Data
static class RemovedNode {
/**
 * The URI of the removed node.
 */
private final URI uri;
/**
 * The cluster node that was removed.
 */
private final ClusterNode node;
}
}