/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.cluster.coordination.node;
import org.apache.commons.collections4.queue.CircularFifoQueue;
import org.apache.commons.lang3.StringUtils;
import org.apache.nifi.cluster.coordination.ClusterCoordinator;
import org.apache.nifi.cluster.coordination.flow.FlowElection;
import org.apache.nifi.cluster.coordination.http.HttpResponseMapper;
import org.apache.nifi.cluster.coordination.http.StandardHttpResponseMapper;
import org.apache.nifi.cluster.coordination.http.replication.RequestCompletionCallback;
import org.apache.nifi.cluster.event.Event;
import org.apache.nifi.cluster.event.NodeEvent;
import org.apache.nifi.cluster.exception.NoClusterCoordinatorException;
import org.apache.nifi.cluster.firewall.ClusterNodeFirewall;
import org.apache.nifi.cluster.manager.NodeResponse;
import org.apache.nifi.cluster.manager.exception.IllegalNodeDisconnectionException;
import org.apache.nifi.cluster.protocol.ComponentRevision;
import org.apache.nifi.cluster.protocol.ConnectionRequest;
import org.apache.nifi.cluster.protocol.ConnectionResponse;
import org.apache.nifi.cluster.protocol.DataFlow;
import org.apache.nifi.cluster.protocol.NodeIdentifier;
import org.apache.nifi.cluster.protocol.NodeProtocolSender;
import org.apache.nifi.cluster.protocol.ProtocolException;
import org.apache.nifi.cluster.protocol.ProtocolHandler;
import org.apache.nifi.cluster.protocol.StandardDataFlow;
import org.apache.nifi.cluster.protocol.impl.ClusterCoordinationProtocolSenderListener;
import org.apache.nifi.cluster.protocol.message.ClusterWorkloadRequestMessage;
import org.apache.nifi.cluster.protocol.message.ClusterWorkloadResponseMessage;
import org.apache.nifi.cluster.protocol.message.ConnectionRequestMessage;
import org.apache.nifi.cluster.protocol.message.ConnectionResponseMessage;
import org.apache.nifi.cluster.protocol.message.DisconnectMessage;
import org.apache.nifi.cluster.protocol.message.NodeConnectionStatusResponseMessage;
import org.apache.nifi.cluster.protocol.message.NodeStatusChangeMessage;
import org.apache.nifi.cluster.protocol.message.ProtocolMessage;
import org.apache.nifi.cluster.protocol.message.ProtocolMessage.MessageType;
import org.apache.nifi.cluster.protocol.message.ReconnectionRequestMessage;
import org.apache.nifi.controller.leader.election.LeaderElectionManager;
import org.apache.nifi.events.EventReporter;
import org.apache.nifi.reporting.Severity;
import org.apache.nifi.services.FlowService;
import org.apache.nifi.util.NiFiProperties;
import org.apache.nifi.web.revision.RevisionManager;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.atomic.AtomicLong;
import java.util.function.Supplier;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
public class NodeClusterCoordinator implements ClusterCoordinator, ProtocolHandler, RequestCompletionCallback {
private static final Logger logger = LoggerFactory.getLogger(NodeClusterCoordinator.class);
private static final String EVENT_CATEGORY = "Clustering";
private static final Pattern COUNTER_URI_PATTERN = Pattern.compile("/nifi-api/counters/[a-f0-9\\-]{36}");
private final String instanceId = UUID.randomUUID().toString();
private volatile NodeIdentifier nodeId;
private final ClusterCoordinationProtocolSenderListener senderListener;
private final EventReporter eventReporter;
private final ClusterNodeFirewall firewall;
private final RevisionManager revisionManager;
private final NiFiProperties nifiProperties;
private final LeaderElectionManager leaderElectionManager;
private final AtomicLong latestUpdateId = new AtomicLong(-1);
private final FlowElection flowElection;
private final NodeProtocolSender nodeProtocolSender;
private volatile FlowService flowService;
private volatile boolean connected;
private volatile boolean closed = false;
private volatile boolean requireElection = true;
private final ConcurrentMap<NodeIdentifier, NodeConnectionStatus> nodeStatuses = new ConcurrentHashMap<>();
private final ConcurrentMap<NodeIdentifier, CircularFifoQueue<NodeEvent>> nodeEvents = new ConcurrentHashMap<>();
public NodeClusterCoordinator(final ClusterCoordinationProtocolSenderListener senderListener, final EventReporter eventReporter, final LeaderElectionManager leaderElectionManager,
final FlowElection flowElection, final ClusterNodeFirewall firewall, final RevisionManager revisionManager, final NiFiProperties nifiProperties,
final NodeProtocolSender nodeProtocolSender) {
this.senderListener = senderListener;
this.flowService = null;
this.eventReporter = eventReporter;
this.firewall = firewall;
this.revisionManager = revisionManager;
this.nifiProperties = nifiProperties;
this.leaderElectionManager = leaderElectionManager;
this.flowElection = flowElection;
this.nodeProtocolSender = nodeProtocolSender;
senderListener.addHandler(this);
}
@Override
public void shutdown() {
if (closed) {
return;
}
closed = true;
final NodeIdentifier localId = getLocalNodeIdentifier();
if (localId != null) {
final NodeConnectionStatus shutdownStatus = new NodeConnectionStatus(localId, DisconnectionCode.NODE_SHUTDOWN);
updateNodeStatus(shutdownStatus, false);
logger.info("Successfully notified other nodes that I am shutting down");
}
}
@Override
public void setLocalNodeIdentifier(final NodeIdentifier nodeId) {
this.nodeId = nodeId;
nodeStatuses.computeIfAbsent(nodeId, id -> new NodeConnectionStatus(id, DisconnectionCode.NOT_YET_CONNECTED));
}
@Override
public NodeIdentifier getLocalNodeIdentifier() {
return nodeId;
}
private NodeIdentifier waitForElectedClusterCoordinator() {
return waitForNodeIdentifier(() -> getElectedActiveCoordinatorNode(false));
}
private NodeIdentifier waitForNodeIdentifier(final Supplier<NodeIdentifier> fetchNodeId) {
NodeIdentifier localNodeId = null;
while (localNodeId == null) {
localNodeId = fetchNodeId.get();
if (localNodeId == null) {
if (closed) {
return null;
}
try {
Thread.sleep(100L);
} catch (final InterruptedException ie) {
Thread.currentThread().interrupt();
return null;
}
}
}
return localNodeId;
}
private String getElectedActiveCoordinatorAddress() throws IOException {
return leaderElectionManager.getLeader(ClusterRoles.CLUSTER_COORDINATOR);
}
@Override
public void resetNodeStatuses(final Map<NodeIdentifier, NodeConnectionStatus> statusMap) {
logger.info("Resetting cluster node statuses from {} to {}", nodeStatuses, statusMap);
// For each proposed replacement, update the nodeStatuses map if and only if the replacement
// has a larger update id than the current value.
for (final Map.Entry<NodeIdentifier, NodeConnectionStatus> entry : statusMap.entrySet()) {
final NodeIdentifier nodeId = entry.getKey();
final NodeConnectionStatus proposedStatus = entry.getValue();
if (proposedStatus.getState() == NodeConnectionState.REMOVED) {
nodeStatuses.remove(nodeId);
} else {
nodeStatuses.put(nodeId, proposedStatus);
}
}
}
@Override
public boolean resetNodeStatus(final NodeConnectionStatus connectionStatus, final long qualifyingUpdateId) {
final NodeIdentifier nodeId = connectionStatus.getNodeIdentifier();
final NodeConnectionStatus currentStatus = getConnectionStatus(nodeId);
if (currentStatus == null) {
return replaceNodeStatus(nodeId, null, connectionStatus);
} else if (currentStatus.getUpdateIdentifier() == qualifyingUpdateId) {
return replaceNodeStatus(nodeId, currentStatus, connectionStatus);
}
// The update identifier is not the same. We will not replace the value
return false;
}
/**
* Attempts to update the nodeStatuses map by changing the value for the
* given node id from the current status to the new status, as in
* ConcurrentMap.replace(nodeId, currentStatus, newStatus) but with the
* difference that this method can handle a <code>null</code> value for
* currentStatus
*
* @param nodeId the node id
* @param currentStatus the current status, or <code>null</code> if there is
* no value currently
* @param newStatus the new status to set
* @return <code>true</code> if the map was updated, false otherwise
*/
private boolean replaceNodeStatus(final NodeIdentifier nodeId, final NodeConnectionStatus currentStatus, final NodeConnectionStatus newStatus) {
if (newStatus == null) {
logger.error("Cannot change node status for {} from {} to {} because new status is null", nodeId, currentStatus, newStatus);
logger.error("", new NullPointerException());
}
if (currentStatus == null) {
if (newStatus.getState() == NodeConnectionState.REMOVED) {
return nodeStatuses.remove(nodeId, currentStatus);
} else {
final NodeConnectionStatus existingValue = nodeStatuses.putIfAbsent(nodeId, newStatus);
return existingValue == null;
}
}
if (newStatus.getState() == NodeConnectionState.REMOVED) {
return nodeStatuses.remove(nodeId, currentStatus);
} else {
return nodeStatuses.replace(nodeId, currentStatus, newStatus);
}
}
@Override
public void requestNodeConnect(final NodeIdentifier nodeId, final String userDn) {
if (requireElection && !flowElection.isElectionComplete() && flowElection.isVoteCounted(nodeId)) {
// If we receive a heartbeat from a node that we already know, we don't want to request that it reconnect
// to the cluster because no flow has yet been elected. However, if the node has not yet voted, we want to send
// a reconnect request because we want this node to cast its vote for the flow, and this happens on connection
logger.debug("Received heartbeat for {} and node is not connected. Will not request node connect to cluster, "
+ "though, because the Flow Election is still in progress", nodeId);
return;
}
if (userDn == null) {
reportEvent(nodeId, Severity.INFO, "Requesting that node connect to cluster");
} else {
reportEvent(nodeId, Severity.INFO, "Requesting that node connect to cluster on behalf of " + userDn);
}
updateNodeStatus(new NodeConnectionStatus(nodeId, NodeConnectionState.CONNECTING, null, null, System.currentTimeMillis()));
// create the request
final ReconnectionRequestMessage request = new ReconnectionRequestMessage();
request.setNodeId(nodeId);
request.setInstanceId(instanceId);
// If we still are requiring that an election take place, we do not want to include our local dataflow, because we don't
// yet know what the cluster's dataflow looks like. However, if we don't require election, then we've connected to the
// cluster, which means that our flow is correct.
final boolean includeDataFlow = !requireElection;
requestReconnectionAsynchronously(request, 10, 5, includeDataFlow);
}
@Override
public void finishNodeConnection(final NodeIdentifier nodeId) {
final NodeConnectionState state = getConnectionState(nodeId);
if (state == null) {
logger.debug("Attempted to finish node connection for {} but node is not known. Requesting that node connect", nodeId);
requestNodeConnect(nodeId, null);
return;
}
if (state == NodeConnectionState.CONNECTED) {
// already connected. Nothing to do.
return;
}
if (state == NodeConnectionState.DISCONNECTED || state == NodeConnectionState.DISCONNECTING) {
logger.debug("Attempted to finish node connection for {} but node state was {}. Requesting that node connect", nodeId, state);
requestNodeConnect(nodeId, null);
return;
}
logger.info("{} is now connected", nodeId);
updateNodeStatus(new NodeConnectionStatus(nodeId, NodeConnectionState.CONNECTED));
}
@Override
public void requestNodeDisconnect(final NodeIdentifier nodeId, final DisconnectionCode disconnectionCode, final String explanation) {
final Set<NodeIdentifier> connectedNodeIds = getNodeIdentifiers(NodeConnectionState.CONNECTED);
if (connectedNodeIds.size() == 1 && connectedNodeIds.contains(nodeId)) {
throw new IllegalNodeDisconnectionException("Cannot disconnect node " + nodeId + " because it is the only node currently connected");
}
logger.info("Requesting that {} disconnect due to {}", nodeId, explanation == null ? disconnectionCode : explanation);
updateNodeStatus(new NodeConnectionStatus(nodeId, disconnectionCode, explanation));
// There is no need to tell the node that it's disconnected if it is due to being
// shutdown, as we will not be able to connect to the node anyway.
if (disconnectionCode == DisconnectionCode.NODE_SHUTDOWN) {
return;
}
final DisconnectMessage request = new DisconnectMessage();
request.setNodeId(nodeId);
request.setExplanation(explanation);
addNodeEvent(nodeId, "Disconnection requested due to " + explanation);
disconnectAsynchronously(request, 10, 5);
}
@Override
public void disconnectionRequestedByNode(final NodeIdentifier nodeId, final DisconnectionCode disconnectionCode, final String explanation) {
logger.info("{} requested disconnection from cluster due to {}", nodeId, explanation == null ? disconnectionCode : explanation);
updateNodeStatus(new NodeConnectionStatus(nodeId, disconnectionCode, explanation));
final Severity severity;
switch (disconnectionCode) {
case STARTUP_FAILURE:
case MISMATCHED_FLOWS:
case UNKNOWN:
severity = Severity.ERROR;
break;
case LACK_OF_HEARTBEAT:
severity = Severity.WARNING;
break;
default:
severity = Severity.INFO;
break;
}
reportEvent(nodeId, severity, "Node disconnected from cluster due to " + explanation);
}
@Override
public void removeNode(final NodeIdentifier nodeId, final String userDn) {
reportEvent(nodeId, Severity.INFO, "User " + userDn + " requested that node be removed from cluster");
nodeStatuses.remove(nodeId);
nodeEvents.remove(nodeId);
notifyOthersOfNodeStatusChange(new NodeConnectionStatus(nodeId, NodeConnectionState.REMOVED));
}
@Override
public NodeConnectionStatus getConnectionStatus(final NodeIdentifier nodeId) {
return nodeStatuses.get(nodeId);
}
private NodeConnectionState getConnectionState(final NodeIdentifier nodeId) {
final NodeConnectionStatus status = getConnectionStatus(nodeId);
return status == null ? null : status.getState();
}
@Override
public List<NodeConnectionStatus> getConnectionStatuses() {
return new ArrayList<>(nodeStatuses.values());
}
@Override
public Map<NodeConnectionState, List<NodeIdentifier>> getConnectionStates() {
final Map<NodeConnectionState, List<NodeIdentifier>> connectionStates = new HashMap<>();
for (final Map.Entry<NodeIdentifier, NodeConnectionStatus> entry : nodeStatuses.entrySet()) {
final NodeConnectionState state = entry.getValue().getState();
final List<NodeIdentifier> nodeIds = connectionStates.computeIfAbsent(state, s -> new ArrayList<NodeIdentifier>());
nodeIds.add(entry.getKey());
}
return connectionStates;
}
@Override
public boolean isBlockedByFirewall(final String hostname) {
return firewall != null && !firewall.isPermissible(hostname);
}
@Override
public void reportEvent(final NodeIdentifier nodeId, final Severity severity, final String event) {
eventReporter.reportEvent(severity, EVENT_CATEGORY, nodeId == null ? event : "Event Reported for " + nodeId + " -- " + event);
if (nodeId != null) {
addNodeEvent(nodeId, severity, event);
}
final String message = nodeId == null ? event : "Event Reported for " + nodeId + " -- " + event;
switch (severity) {
case ERROR:
logger.error(message);
break;
case WARNING:
logger.warn(message);
break;
case INFO:
logger.info(message);
break;
}
}
@Override
public NodeIdentifier getNodeIdentifier(final String uuid) {
for (final NodeIdentifier nodeId : nodeStatuses.keySet()) {
if (nodeId.getId().equals(uuid)) {
return nodeId;
}
}
return null;
}
@Override
public Set<NodeIdentifier> getNodeIdentifiers(final NodeConnectionState... states) {
final Set<NodeConnectionState> statesOfInterest = new HashSet<>();
if (states.length == 0) {
for (final NodeConnectionState state : NodeConnectionState.values()) {
statesOfInterest.add(state);
}
} else {
for (final NodeConnectionState state : states) {
statesOfInterest.add(state);
}
}
return nodeStatuses.entrySet().stream()
.filter(entry -> statesOfInterest.contains(entry.getValue().getState()))
.map(entry -> entry.getKey())
.collect(Collectors.toSet());
}
@Override
public NodeIdentifier getPrimaryNode() {
final String primaryNodeAddress = leaderElectionManager.getLeader(ClusterRoles.PRIMARY_NODE);
if (primaryNodeAddress == null) {
return null;
}
return nodeStatuses.keySet().stream()
.filter(nodeId -> primaryNodeAddress.equals(nodeId.getSocketAddress() + ":" + nodeId.getSocketPort()))
.findFirst()
.orElse(null);
}
@Override
public NodeIdentifier getElectedActiveCoordinatorNode() {
return getElectedActiveCoordinatorNode(true);
}
private NodeIdentifier getElectedActiveCoordinatorNode(final boolean warnOnError) {
final String electedNodeAddress;
try {
electedNodeAddress = getElectedActiveCoordinatorAddress();
} catch (final NoClusterCoordinatorException ncce) {
logger.debug("There is currently no elected active Cluster Coordinator");
return null;
} catch (final IOException ioe) {
if (warnOnError) {
logger.warn("Failed to determine which node is elected active Cluster Coordinator. There may be no coordinator currently: " + ioe);
if (logger.isDebugEnabled()) {
logger.warn("", ioe);
}
}
return null;
}
if (electedNodeAddress == null) {
logger.debug("There is currently no elected active Cluster Coordinator");
return null;
}
final int colonLoc = electedNodeAddress.indexOf(':');
if (colonLoc < 1) {
if (warnOnError) {
logger.warn("Failed to determine which node is elected active Cluster Coordinator: ZooKeeper reports the address as {}, but this is not a valid address", electedNodeAddress);
}
return null;
}
final String electedNodeHostname = electedNodeAddress.substring(0, colonLoc);
final String portString = electedNodeAddress.substring(colonLoc + 1);
final int electedNodePort;
try {
electedNodePort = Integer.parseInt(portString);
} catch (final NumberFormatException nfe) {
if (warnOnError) {
logger.warn("Failed to determine which node is elected active Cluster Coordinator: ZooKeeper reports the address as {}, but this is not a valid address", electedNodeAddress);
}
return null;
}
final Set<NodeIdentifier> connectedNodeIds = getNodeIdentifiers();
final NodeIdentifier electedNodeId = connectedNodeIds.stream()
.filter(nodeId -> nodeId.getSocketAddress().equals(electedNodeHostname) && nodeId.getSocketPort() == electedNodePort)
.findFirst()
.orElse(null);
if (electedNodeId == null && warnOnError) {
logger.debug("Failed to determine which node is elected active Cluster Coordinator: ZooKeeper reports the address as {},"
+ "but there is no node with this address. Will attempt to communicate with node to determine its information", electedNodeAddress);
try {
final NodeConnectionStatus connectionStatus = senderListener.requestNodeConnectionStatus(electedNodeHostname, electedNodePort);
logger.debug("Received NodeConnectionStatus {}", connectionStatus);
if (connectionStatus == null) {
return null;
}
final NodeConnectionStatus existingStatus = this.nodeStatuses.putIfAbsent(connectionStatus.getNodeIdentifier(), connectionStatus);
if (existingStatus == null) {
return connectionStatus.getNodeIdentifier();
} else {
return existingStatus.getNodeIdentifier();
}
} catch (final Exception e) {
logger.warn("Failed to determine which node is elected active Cluster Coordinator: ZooKeeper reports the address as {}, but there is no node with this address. "
+ "Attempted to determine the node's information but failed to retrieve its information due to {}", electedNodeAddress, e.toString());
if (logger.isDebugEnabled()) {
logger.warn("", e);
}
}
}
return electedNodeId;
}
@Override
public boolean isActiveClusterCoordinator() {
final NodeIdentifier self = getLocalNodeIdentifier();
return self != null && self.equals(getElectedActiveCoordinatorNode());
}
@Override
public List<NodeEvent> getNodeEvents(final NodeIdentifier nodeId) {
final CircularFifoQueue<NodeEvent> eventQueue = nodeEvents.get(nodeId);
if (eventQueue == null) {
return Collections.emptyList();
}
synchronized (eventQueue) {
return new ArrayList<>(eventQueue);
}
}
@Override
public void setFlowService(final FlowService flowService) {
if (this.flowService != null) {
throw new IllegalStateException("Flow Service has already been set");
}
this.flowService = flowService;
}
private void addNodeEvent(final NodeIdentifier nodeId, final String event) {
addNodeEvent(nodeId, Severity.INFO, event);
}
private void addNodeEvent(final NodeIdentifier nodeId, final Severity severity, final String message) {
final NodeEvent event = new Event(nodeId.toString(), message, severity);
final CircularFifoQueue<NodeEvent> eventQueue = nodeEvents.computeIfAbsent(nodeId, id -> new CircularFifoQueue<>());
synchronized (eventQueue) {
eventQueue.add(event);
}
}
/**
* Updates the status of the node with the given ID to the given status and
* returns <code>true</code> if successful, <code>false</code> if no node
* exists with the given ID
*
* @param status the new status of the node
*/
// visible for testing.
void updateNodeStatus(final NodeConnectionStatus status) {
updateNodeStatus(status, true);
}
void updateNodeStatus(final NodeConnectionStatus status, final boolean waitForCoordinator) {
final NodeIdentifier nodeId = status.getNodeIdentifier();
// In this case, we are using nodeStatuses.put() instead of getting the current value and
// comparing that to the new value and using the one with the largest update id. This is because
// this method is called when something occurs that causes this node to change the status of the
// node in question. We only use comparisons against the current value when we receive an update
// about a node status from a different node, since those may be received out-of-order.
final NodeConnectionStatus currentStatus = nodeStatuses.put(nodeId, status);
final NodeConnectionState currentState = currentStatus == null ? null : currentStatus.getState();
logger.info("Status of {} changed from {} to {}", nodeId, currentStatus, status);
logger.debug("State of cluster nodes is now {}", nodeStatuses);
latestUpdateId.updateAndGet(curVal -> Math.max(curVal, status.getUpdateIdentifier()));
if (currentState == null || currentState != status.getState()) {
final boolean notifyAllNodes = isActiveClusterCoordinator();
if (notifyAllNodes) {
logger.debug("Notifying all nodes that status changed from {} to {}", currentStatus, status);
} else {
logger.debug("Notifying cluster coordinator that node status changed from {} to {}", currentStatus, status);
}
notifyOthersOfNodeStatusChange(status, notifyAllNodes, waitForCoordinator);
} else {
logger.debug("Not notifying other nodes that status changed because previous state of {} is same as new state of {}", currentState, status.getState());
}
}
void notifyOthersOfNodeStatusChange(final NodeConnectionStatus updatedStatus) {
notifyOthersOfNodeStatusChange(updatedStatus, isActiveClusterCoordinator(), true);
}
/**
* Notifies other nodes that the status of a node changed
*
* @param updatedStatus the updated status for a node in the cluster
* @param notifyAllNodes if <code>true</code> will notify all nodes. If
* <code>false</code>, will notify only the cluster coordinator
*/
void notifyOthersOfNodeStatusChange(final NodeConnectionStatus updatedStatus, final boolean notifyAllNodes, final boolean waitForCoordinator) {
// If this node is the active cluster coordinator, then we are going to replicate to all nodes.
// Otherwise, get the active coordinator (or wait for one to become active) and then notify the coordinator.
final Set<NodeIdentifier> nodesToNotify;
if (notifyAllNodes) {
nodesToNotify = getNodeIdentifiers(NodeConnectionState.CONNECTED, NodeConnectionState.CONNECTING);
// Do not notify ourselves because we already know about the status update.
nodesToNotify.remove(getLocalNodeIdentifier());
} else if (waitForCoordinator) {
nodesToNotify = Collections.singleton(waitForElectedClusterCoordinator());
} else {
final NodeIdentifier nodeId = getElectedActiveCoordinatorNode();
if (nodeId == null) {
return;
}
nodesToNotify = Collections.singleton(nodeId);
}
final NodeStatusChangeMessage message = new NodeStatusChangeMessage();
message.setNodeId(updatedStatus.getNodeIdentifier());
message.setNodeConnectionStatus(updatedStatus);
senderListener.notifyNodeStatusChange(nodesToNotify, message);
}
private void disconnectAsynchronously(final DisconnectMessage request, final int attempts, final int retrySeconds) {
final Thread disconnectThread = new Thread(new Runnable() {
@Override
public void run() {
final NodeIdentifier nodeId = request.getNodeId();
for (int i = 0; i < attempts; i++) {
try {
senderListener.disconnect(request);
reportEvent(nodeId, Severity.INFO, "Node disconnected due to " + request.getExplanation());
return;
} catch (final Exception e) {
logger.error("Failed to notify {} that it has been disconnected from the cluster due to {}", request.getNodeId(), request.getExplanation());
try {
Thread.sleep(retrySeconds * 1000L);
} catch (final InterruptedException ie) {
Thread.currentThread().interrupt();
return;
}
}
}
}
}, "Disconnect " + request.getNodeId());
disconnectThread.start();
}
private void requestReconnectionAsynchronously(final ReconnectionRequestMessage request, final int reconnectionAttempts, final int retrySeconds, final boolean includeDataFlow) {
final Thread reconnectionThread = new Thread(new Runnable() {
@Override
public void run() {
// create the request
while (flowService == null) {
try {
Thread.sleep(100L);
} catch (final InterruptedException ie) {
logger.info("Could not send Reconnection request to {} because thread was "
+ "interrupted before FlowService was made available", request.getNodeId());
Thread.currentThread().interrupt();
return;
}
}
for (int i = 0; i < reconnectionAttempts; i++) {
try {
if (NodeConnectionState.CONNECTING != getConnectionState(request.getNodeId())) {
// the node status has changed. It's no longer appropriate to attempt reconnection.
return;
}
if (includeDataFlow) {
request.setDataFlow(new StandardDataFlow(flowService.createDataFlowFromController()));
}
request.setNodeConnectionStatuses(getConnectionStatuses());
request.setComponentRevisions(revisionManager.getAllRevisions().stream().map(rev -> ComponentRevision.fromRevision(rev)).collect(Collectors.toList()));
// Issue a reconnection request to the node.
senderListener.requestReconnection(request);
// successfully told node to reconnect -- we're done!
logger.info("Successfully requested that {} join the cluster", request.getNodeId());
return;
} catch (final Exception e) {
logger.warn("Problem encountered issuing reconnection request to node " + request.getNodeId(), e);
eventReporter.reportEvent(Severity.WARNING, EVENT_CATEGORY, "Problem encountered issuing reconnection request to node "
+ request.getNodeId() + " due to: " + e);
}
try {
Thread.sleep(1000L * retrySeconds);
} catch (final InterruptedException ie) {
break;
}
}
// We failed to reconnect too many times. We must now mark node as disconnected.
if (NodeConnectionState.CONNECTING == getConnectionState(request.getNodeId())) {
requestNodeDisconnect(request.getNodeId(), DisconnectionCode.UNABLE_TO_COMMUNICATE,
"Attempted to request that node reconnect to cluster but could not communicate with node");
}
}
}, "Reconnect " + request.getNodeId());
reconnectionThread.start();
}
@Override
public ProtocolMessage handle(final ProtocolMessage protocolMessage) throws ProtocolException {
switch (protocolMessage.getType()) {
case CONNECTION_REQUEST:
return handleConnectionRequest((ConnectionRequestMessage) protocolMessage);
case NODE_STATUS_CHANGE:
handleNodeStatusChange((NodeStatusChangeMessage) protocolMessage);
return null;
case NODE_CONNECTION_STATUS_REQUEST:
return handleNodeConnectionStatusRequest();
default:
throw new ProtocolException("Cannot handle Protocol Message " + protocolMessage + " because it is not of the correct type");
}
}
private NodeConnectionStatusResponseMessage handleNodeConnectionStatusRequest() {
final NodeConnectionStatusResponseMessage msg = new NodeConnectionStatusResponseMessage();
final NodeIdentifier self = getLocalNodeIdentifier();
if (self != null) {
final NodeConnectionStatus connectionStatus = nodeStatuses.get(self);
msg.setNodeConnectionStatus(connectionStatus);
}
return msg;
}
private String summarizeStatusChange(final NodeConnectionStatus oldStatus, final NodeConnectionStatus status) {
final StringBuilder sb = new StringBuilder();
if (oldStatus == null || status.getState() != oldStatus.getState()) {
sb.append("Node Status changed from ").append(oldStatus == null ? "[Unknown Node]" : oldStatus.getState().toString()).append(" to ").append(status.getState().toString());
if (status.getDisconnectReason() != null) {
sb.append(" due to ").append(status.getDisconnectReason());
} else if (status.getDisconnectCode() != null) {
sb.append(" due to ").append(status.getDisconnectCode().toString());
}
}
return sb.toString();
}
private void handleNodeStatusChange(final NodeStatusChangeMessage statusChangeMessage) {
final NodeConnectionStatus updatedStatus = statusChangeMessage.getNodeConnectionStatus();
final NodeIdentifier nodeId = statusChangeMessage.getNodeId();
logger.debug("Handling request {}", statusChangeMessage);
final NodeConnectionStatus oldStatus = nodeStatuses.get(statusChangeMessage.getNodeId());
// Either remove the value from the map or update the map depending on the connection state
if (statusChangeMessage.getNodeConnectionStatus().getState() == NodeConnectionState.REMOVED) {
nodeStatuses.remove(nodeId, oldStatus);
} else {
nodeStatuses.put(nodeId, updatedStatus);
}
logger.info("Status of {} changed from {} to {}", statusChangeMessage.getNodeId(), oldStatus, updatedStatus);
logger.debug("State of cluster nodes is now {}", nodeStatuses);
final NodeConnectionStatus status = statusChangeMessage.getNodeConnectionStatus();
final String summary = summarizeStatusChange(oldStatus, status);
if (!StringUtils.isEmpty(summary)) {
addNodeEvent(nodeId, summary);
}
// Update our counter so that we are in-sync with the cluster on the
// most up-to-date version of the NodeConnectionStatus' Update Identifier.
// We do this so that we can accurately compare status updates that are generated
// locally against those generated from other nodes in the cluster.
NodeConnectionStatus.updateIdGenerator(updatedStatus.getUpdateIdentifier());
if (isActiveClusterCoordinator()) {
notifyOthersOfNodeStatusChange(statusChangeMessage.getNodeConnectionStatus());
}
}
@Override
public String getFlowElectionStatus() {
if (!requireElection) {
return null;
}
return flowElection.getStatusDescription();
}
@Override
public boolean isFlowElectionComplete() {
return !requireElection || flowElection.isElectionComplete();
}
private NodeIdentifier resolveNodeId(final NodeIdentifier proposedIdentifier) {
final NodeConnectionStatus proposedConnectionStatus = new NodeConnectionStatus(proposedIdentifier, DisconnectionCode.NOT_YET_CONNECTED);
final NodeConnectionStatus existingStatus = nodeStatuses.putIfAbsent(proposedIdentifier, proposedConnectionStatus);
NodeIdentifier resolvedNodeId = proposedIdentifier;
if (existingStatus == null) {
// there is no node with that ID
resolvedNodeId = proposedIdentifier;
logger.debug("No existing node with ID {}; resolved node ID is as-proposed", proposedIdentifier.getId());
} else if (existingStatus.getNodeIdentifier().logicallyEquals(proposedIdentifier)) {
// there is a node with that ID but it's the same node.
resolvedNodeId = proposedIdentifier;
logger.debug("No existing node with ID {}; resolved node ID is as-proposed", proposedIdentifier.getId());
} else {
// there is a node with that ID and it's a different node
resolvedNodeId = new NodeIdentifier(UUID.randomUUID().toString(), proposedIdentifier.getApiAddress(), proposedIdentifier.getApiPort(),
proposedIdentifier.getSocketAddress(), proposedIdentifier.getSocketPort(), proposedIdentifier.getSiteToSiteAddress(),
proposedIdentifier.getSiteToSitePort(), proposedIdentifier.getSiteToSiteHttpApiPort(), proposedIdentifier.isSiteToSiteSecure());
logger.debug("A node already exists with ID {}. Proposed Node Identifier was {}; existing Node Identifier is {}; Resolved Node Identifier is {}",
proposedIdentifier.getId(), proposedIdentifier, getNodeIdentifier(proposedIdentifier.getId()), resolvedNodeId);
}
return resolvedNodeId;
}
private ConnectionResponseMessage handleConnectionRequest(final ConnectionRequestMessage requestMessage) {
final NodeIdentifier proposedIdentifier = requestMessage.getConnectionRequest().getProposedNodeIdentifier();
final NodeIdentifier withRequestorDn = addRequestorDn(proposedIdentifier, requestMessage.getRequestorDN());
final DataFlow dataFlow = requestMessage.getConnectionRequest().getDataFlow();
final ConnectionRequest requestWithDn = new ConnectionRequest(withRequestorDn, dataFlow);
// Resolve Node identifier.
final NodeIdentifier resolvedNodeId = resolveNodeId(proposedIdentifier);
if (requireElection) {
final DataFlow electedDataFlow = flowElection.castVote(dataFlow, withRequestorDn);
if (electedDataFlow == null) {
logger.info("Received Connection Request from {}; responding with Flow Election In Progress message", withRequestorDn);
return createFlowElectionInProgressResponse();
} else {
logger.info("Received Connection Request from {}; responding with DataFlow that was elected", withRequestorDn);
return createConnectionResponse(requestWithDn, resolvedNodeId, electedDataFlow);
}
}
logger.info("Received Connection Request from {}; responding with my DataFlow", withRequestorDn);
return createConnectionResponse(requestWithDn, resolvedNodeId);
}
private ConnectionResponseMessage createFlowElectionInProgressResponse() {
final ConnectionResponseMessage responseMessage = new ConnectionResponseMessage();
final String statusDescription = flowElection.getStatusDescription();
responseMessage.setConnectionResponse(new ConnectionResponse(5, "Cluster is still voting on which Flow is the correct flow for the cluster. " + statusDescription));
return responseMessage;
}
private ConnectionResponseMessage createConnectionResponse(final ConnectionRequest request, final NodeIdentifier resolvedNodeIdentifier) {
DataFlow dataFlow = null;
if (flowService != null) {
try {
dataFlow = flowService.createDataFlowFromController();
} catch (final IOException ioe) {
logger.error("Unable to obtain current dataflow from FlowService in order to provide the flow to "
+ resolvedNodeIdentifier + ". Will tell node to try again later", ioe);
}
}
return createConnectionResponse(request, resolvedNodeIdentifier, dataFlow);
}
private ConnectionResponseMessage createConnectionResponse(final ConnectionRequest request, final NodeIdentifier resolvedNodeIdentifier, final DataFlow clusterDataFlow) {
if (isBlockedByFirewall(resolvedNodeIdentifier.getSocketAddress())) {
// if the socket address is not listed in the firewall, then return a null response
logger.info("Firewall blocked connection request from node " + resolvedNodeIdentifier);
final ConnectionResponse response = ConnectionResponse.createBlockedByFirewallResponse();
final ConnectionResponseMessage responseMessage = new ConnectionResponseMessage();
responseMessage.setConnectionResponse(response);
return responseMessage;
}
if (clusterDataFlow == null) {
final ConnectionResponseMessage responseMessage = new ConnectionResponseMessage();
responseMessage.setConnectionResponse(new ConnectionResponse(5, "The cluster dataflow is not yet available"));
return responseMessage;
}
// Set node's status to 'CONNECTING'
NodeConnectionStatus status = getConnectionStatus(resolvedNodeIdentifier);
if (status == null) {
addNodeEvent(resolvedNodeIdentifier, "Connection requested from new node. Setting status to connecting.");
} else {
addNodeEvent(resolvedNodeIdentifier, "Connection requested from existing node. Setting status to connecting.");
}
status = new NodeConnectionStatus(resolvedNodeIdentifier, NodeConnectionState.CONNECTING, null, null, System.currentTimeMillis());
updateNodeStatus(status);
final ConnectionResponse response = new ConnectionResponse(resolvedNodeIdentifier, clusterDataFlow, instanceId, getConnectionStatuses(),
revisionManager.getAllRevisions().stream().map(rev -> ComponentRevision.fromRevision(rev)).collect(Collectors.toList()));
final ConnectionResponseMessage responseMessage = new ConnectionResponseMessage();
responseMessage.setConnectionResponse(response);
return responseMessage;
}
private NodeIdentifier addRequestorDn(final NodeIdentifier nodeId, final String dn) {
return new NodeIdentifier(nodeId.getId(), nodeId.getApiAddress(), nodeId.getApiPort(),
nodeId.getSocketAddress(), nodeId.getSocketPort(),
nodeId.getSiteToSiteAddress(), nodeId.getSiteToSitePort(),
nodeId.getSiteToSiteHttpApiPort(), nodeId.isSiteToSiteSecure(), dn);
}
@Override
public boolean canHandle(final ProtocolMessage msg) {
return MessageType.CONNECTION_REQUEST == msg.getType() || MessageType.NODE_STATUS_CHANGE == msg.getType()
|| MessageType.NODE_CONNECTION_STATUS_REQUEST == msg.getType();
}
private boolean isMutableRequest(final String method) {
return "DELETE".equalsIgnoreCase(method) || "POST".equalsIgnoreCase(method) || "PUT".equalsIgnoreCase(method);
}
/**
* Callback that is called after an HTTP Request has been replicated to
* nodes in the cluster. This allows us to disconnect nodes that did not
* complete the request, if applicable.
*/
@Override
public void afterRequest(final String uriPath, final String method, final Set<NodeResponse> nodeResponses) {
// if we are not the active cluster coordinator, then we are not responsible for monitoring the responses,
// as the cluster coordinator is responsible for performing the actual request replication.
if (!isActiveClusterCoordinator()) {
return;
}
final boolean mutableRequest = isMutableRequest(method);
/*
* Nodes that encountered issues handling the request are marked as
* disconnected for mutable requests (e.g., post, put, delete). For
* other requests (e.g., get, head), the nodes remain in their current
* state even if they had problems handling the request.
*/
if (mutableRequest) {
final HttpResponseMapper responseMerger = new StandardHttpResponseMapper(nifiProperties);
final Set<NodeResponse> problematicNodeResponses = responseMerger.getProblematicNodeResponses(nodeResponses);
// all nodes failed
final boolean allNodesFailed = problematicNodeResponses.size() == nodeResponses.size();
// some nodes had a problematic response because of a missing counter, ensure the are not disconnected
final boolean someNodesFailedMissingCounter = !problematicNodeResponses.isEmpty()
&& problematicNodeResponses.size() < nodeResponses.size() && isMissingCounter(problematicNodeResponses, uriPath);
// ensure nodes stay connected in certain scenarios
if (allNodesFailed) {
logger.warn("All nodes failed to process URI {} {}. As a result, no node will be disconnected from cluster", method, uriPath);
return;
}
if (someNodesFailedMissingCounter) {
return;
}
// disconnect problematic nodes
if (!problematicNodeResponses.isEmpty() && problematicNodeResponses.size() < nodeResponses.size()) {
final Set<NodeIdentifier> failedNodeIds = problematicNodeResponses.stream().map(response -> response.getNodeId()).collect(Collectors.toSet());
logger.warn(String.format("The following nodes failed to process URI %s '%s'. Requesting each node disconnect from cluster.", uriPath, failedNodeIds));
for (final NodeIdentifier nodeId : failedNodeIds) {
requestNodeDisconnect(nodeId, DisconnectionCode.FAILED_TO_SERVICE_REQUEST, "Failed to process request " + method + " " + uriPath);
}
}
}
}
/**
* Determines if all problematic responses were due to 404 NOT_FOUND.
* Assumes that problematicNodeResponses is not empty and is not comprised
* of responses from all nodes in the cluster (at least one node contained
* the counter in question).
*
* @param problematicNodeResponses The problematic node responses
* @param uriPath The path of the URI for the request
* @return Whether all problematic node responses were due to a missing
* counter
*/
private boolean isMissingCounter(final Set<NodeResponse> problematicNodeResponses, final String uriPath) {
if (COUNTER_URI_PATTERN.matcher(uriPath).matches()) {
boolean notFound = true;
for (final NodeResponse problematicResponse : problematicNodeResponses) {
if (problematicResponse.getStatus() != 404) {
notFound = false;
break;
}
}
return notFound;
}
return false;
}
@Override
public void setConnected(final boolean connected) {
this.connected = connected;
// Once we have connected to the cluster, election is no longer required.
// It is required only upon startup so that if multiple nodes are started up
// at the same time, and they have different flows, that we don't choose the
// wrong flow as the 'golden copy' by electing that node as the elected
// active Cluster Coordinator.
if (connected) {
logger.info("This node is now connected to the cluster. Will no longer require election of DataFlow.");
requireElection = false;
}
}
@Override
public boolean isConnected() {
return connected;
}
@Override
public Map<NodeIdentifier, NodeWorkload> getClusterWorkload() throws IOException {
final ClusterWorkloadRequestMessage request = new ClusterWorkloadRequestMessage();
final ClusterWorkloadResponseMessage response = nodeProtocolSender.clusterWorkload(request);
return response.getNodeWorkloads();
}
}