/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.cluster.coordination.heartbeat;
import org.apache.nifi.cluster.coordination.ClusterCoordinator;
import org.apache.nifi.cluster.coordination.node.DisconnectionCode;
import org.apache.nifi.cluster.coordination.node.NodeConnectionState;
import org.apache.nifi.cluster.coordination.node.NodeConnectionStatus;
import org.apache.nifi.cluster.protocol.NodeIdentifier;
import org.apache.nifi.engine.FlowEngine;
import org.apache.nifi.reporting.Severity;
import org.apache.nifi.util.FormatUtils;
import org.apache.nifi.util.NiFiProperties;
import org.apache.nifi.util.StopWatch;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Map;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;
public abstract class AbstractHeartbeatMonitor implements HeartbeatMonitor {
private final int heartbeatIntervalMillis;
private static final Logger logger = LoggerFactory.getLogger(AbstractHeartbeatMonitor.class);
protected final ClusterCoordinator clusterCoordinator;
protected final FlowEngine flowEngine = new FlowEngine(1, "Heartbeat Monitor", true);
private volatile ScheduledFuture<?> future;
private volatile boolean stopped = true;
public AbstractHeartbeatMonitor(final ClusterCoordinator clusterCoordinator, final NiFiProperties nifiProperties) {
this.clusterCoordinator = clusterCoordinator;
final String heartbeatInterval = nifiProperties.getProperty(NiFiProperties.CLUSTER_PROTOCOL_HEARTBEAT_INTERVAL,
NiFiProperties.DEFAULT_CLUSTER_PROTOCOL_HEARTBEAT_INTERVAL);
this.heartbeatIntervalMillis = (int) FormatUtils.getTimeDuration(heartbeatInterval, TimeUnit.MILLISECONDS);
}
@Override
public synchronized final void start() {
if (!stopped) {
logger.info("Attempted to start Heartbeat Monitor but it is already started. Stopping heartbeat monitor and re-starting it.");
stop();
}
stopped = false;
logger.info("Heartbeat Monitor started");
try {
onStart();
} catch (final Exception e) {
logger.error("Failed to start Heartbeat Monitor", e);
}
this.future = flowEngine.scheduleWithFixedDelay(new Runnable() {
@Override
public void run() {
try {
monitorHeartbeats();
} catch (final Exception e) {
clusterCoordinator.reportEvent(null, Severity.ERROR, "Failed to process heartbeats from nodes due to " + e.toString());
logger.error("Failed to process heartbeats", e);
}
}
}, heartbeatIntervalMillis, heartbeatIntervalMillis, TimeUnit.MILLISECONDS);
}
@Override
public synchronized final void stop() {
if (stopped) {
return;
}
this.stopped = true;
logger.info("Heartbeat Monitor stopped");
try {
if (future != null) {
future.cancel(true);
}
} finally {
onStop();
}
}
protected boolean isStopped() {
return stopped;
}
@Override
public NodeHeartbeat getLatestHeartbeat(final NodeIdentifier nodeId) {
return getLatestHeartbeats().get(nodeId);
}
protected ClusterCoordinator getClusterCoordinator() {
return clusterCoordinator;
}
protected long getHeartbeatInterval(final TimeUnit timeUnit) {
return timeUnit.convert(heartbeatIntervalMillis, TimeUnit.MILLISECONDS);
}
/**
* Fetches all of the latest heartbeats and updates the Cluster Coordinator
* as appropriate, based on the heartbeats received.
*
* Visible for testing.
*/
protected synchronized void monitorHeartbeats() {
final NodeIdentifier activeCoordinator = clusterCoordinator.getElectedActiveCoordinatorNode();
if (activeCoordinator != null && !activeCoordinator.equals(clusterCoordinator.getLocalNodeIdentifier())) {
// Occasionally Curator appears to not notify us that we have lost the elected leader role, or does so
// on a very large delay. So before we kick the node out of the cluster, we want to first check what the
// ZNode in ZooKeeper says, and ensure that this is the node that is being advertised as the appropriate
// destination for heartbeats.
logger.debug("It appears that this node is no longer the actively elected cluster coordinator. Will not request that node disconnect.");
return;
}
final Map<NodeIdentifier, NodeHeartbeat> latestHeartbeats = getLatestHeartbeats();
if (latestHeartbeats == null || latestHeartbeats.isEmpty()) {
logger.debug("Received no new heartbeats. Will not disconnect any nodes due to lack of heartbeat");
return;
}
final StopWatch procStopWatch = new StopWatch(true);
for (final NodeHeartbeat heartbeat : latestHeartbeats.values()) {
try {
processHeartbeat(heartbeat);
} catch (final Exception e) {
clusterCoordinator.reportEvent(null, Severity.ERROR,
"Received heartbeat from " + heartbeat.getNodeIdentifier() + " but failed to process heartbeat due to " + e);
logger.error("Failed to process heartbeat from {} due to {}", heartbeat.getNodeIdentifier(), e.toString());
logger.error("", e);
}
}
procStopWatch.stop();
logger.info("Finished processing {} heartbeats in {}", latestHeartbeats.size(), procStopWatch.getDuration());
// Disconnect any node that hasn't sent a heartbeat in a long time (8 times the heartbeat interval)
final long maxMillis = heartbeatIntervalMillis * 8;
final long threshold = System.currentTimeMillis() - maxMillis;
for (final NodeHeartbeat heartbeat : latestHeartbeats.values()) {
if (heartbeat.getTimestamp() < threshold) {
final long secondsSinceLastHeartbeat = TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis() - heartbeat.getTimestamp());
clusterCoordinator.disconnectionRequestedByNode(heartbeat.getNodeIdentifier(), DisconnectionCode.LACK_OF_HEARTBEAT,
"Have not received a heartbeat from node in " + secondsSinceLastHeartbeat + " seconds");
try {
removeHeartbeat(heartbeat.getNodeIdentifier());
} catch (final Exception e) {
logger.warn("Failed to remove heartbeat for {} due to {}", heartbeat.getNodeIdentifier(), e.toString());
logger.warn("", e);
}
}
}
}
private void processHeartbeat(final NodeHeartbeat heartbeat) {
final NodeIdentifier nodeId = heartbeat.getNodeIdentifier();
// Do not process heartbeat if it's blocked by firewall.
if (clusterCoordinator.isBlockedByFirewall(nodeId.getSocketAddress())) {
clusterCoordinator.reportEvent(nodeId, Severity.WARNING, "Firewall blocked received heartbeat. Issuing disconnection request.");
// request node to disconnect
clusterCoordinator.requestNodeDisconnect(nodeId, DisconnectionCode.BLOCKED_BY_FIREWALL, "Blocked by Firewall");
removeHeartbeat(nodeId);
return;
}
final NodeConnectionStatus connectionStatus = clusterCoordinator.getConnectionStatus(nodeId);
if (connectionStatus == null) {
// Unknown node. Issue reconnect request
clusterCoordinator.reportEvent(nodeId, Severity.INFO, "Received heartbeat from unknown node. Removing heartbeat and requesting that node connect to cluster.");
removeHeartbeat(nodeId);
clusterCoordinator.requestNodeConnect(nodeId, null);
return;
}
final NodeConnectionState connectionState = connectionStatus.getState();
if (heartbeat.getConnectionStatus().getState() != NodeConnectionState.CONNECTED && connectionState == NodeConnectionState.CONNECTED) {
// Cluster Coordinator believes that node is connected, but node does not believe so.
clusterCoordinator.reportEvent(nodeId, Severity.WARNING, "Received heartbeat from node that thinks it is not yet part of the cluster,"
+ "though the Cluster Coordinator thought it was (node claimed state was " + heartbeat.getConnectionStatus().getState()
+ "). Marking as Disconnected and requesting that Node reconnect to cluster");
clusterCoordinator.requestNodeConnect(nodeId, null);
return;
}
if (NodeConnectionState.DISCONNECTED == connectionState) {
// ignore heartbeats from nodes disconnected by means other than lack of heartbeat, unless it is
// the only node. We allow it if it is the only node because if we have a one-node cluster, then
// we cannot manually reconnect it.
final DisconnectionCode disconnectionCode = connectionStatus.getDisconnectCode();
// Determine whether or not the node should be allowed to be in the cluster still, depending on its reason for disconnection.
switch (disconnectionCode) {
case LACK_OF_HEARTBEAT:
case UNABLE_TO_COMMUNICATE:
case NOT_YET_CONNECTED:
case STARTUP_FAILURE: {
clusterCoordinator.reportEvent(nodeId, Severity.INFO, "Received heartbeat from node previously "
+ "disconnected due to " + disconnectionCode + ". Issuing reconnection request.");
clusterCoordinator.requestNodeConnect(nodeId, null);
break;
}
default: {
// disconnected nodes should not heartbeat, so we need to issue a disconnection request.
logger.info("Ignoring received heartbeat from disconnected node " + nodeId + ". Issuing disconnection request.");
clusterCoordinator.requestNodeDisconnect(nodeId, disconnectionCode, connectionStatus.getDisconnectReason());
removeHeartbeat(nodeId);
break;
}
}
return;
}
if (NodeConnectionState.DISCONNECTING == connectionStatus.getState()) {
// ignore spurious heartbeat
removeHeartbeat(nodeId);
return;
}
// first heartbeat causes status change from connecting to connected
if (NodeConnectionState.CONNECTING == connectionState) {
final Long connectionRequestTime = connectionStatus.getConnectionRequestTime();
if (connectionRequestTime != null && heartbeat.getTimestamp() < connectionRequestTime) {
clusterCoordinator.reportEvent(nodeId, Severity.INFO, "Received heartbeat but ignoring because it was reported before the node was last asked to reconnect.");
removeHeartbeat(nodeId);
return;
}
// connection complete
clusterCoordinator.finishNodeConnection(nodeId);
clusterCoordinator.reportEvent(nodeId, Severity.INFO, "Received first heartbeat from connecting node. Node connected.");
}
}
/**
* @return the most recent heartbeat information for each node in the
* cluster
*/
protected abstract Map<NodeIdentifier, NodeHeartbeat> getLatestHeartbeats();
/**
* This method does nothing in the abstract class but is meant for
* subclasses to override in order to provide functionality when the monitor
* is started.
*/
protected void onStart() {
}
/**
* This method does nothing in the abstract class but is meant for
* subclasses to override in order to provide functionality when the monitor
* is stopped.
*/
protected void onStop() {
}
}