/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.ambari.server.events.listeners.alerts;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.concurrent.locks.Lock;
import org.apache.ambari.server.AmbariException;
import org.apache.ambari.server.EagerSingleton;
import org.apache.ambari.server.configuration.Configuration;
import org.apache.ambari.server.controller.MaintenanceStateHelper;
import org.apache.ambari.server.controller.RootServiceResponseFactory.Components;
import org.apache.ambari.server.controller.RootServiceResponseFactory.Services;
import org.apache.ambari.server.events.AlertEvent;
import org.apache.ambari.server.events.AlertReceivedEvent;
import org.apache.ambari.server.events.AlertStateChangeEvent;
import org.apache.ambari.server.events.InitialAlertEvent;
import org.apache.ambari.server.events.publishers.AlertEventPublisher;
import org.apache.ambari.server.orm.RequiresSession;
import org.apache.ambari.server.orm.dao.AlertDefinitionDAO;
import org.apache.ambari.server.orm.dao.AlertsDAO;
import org.apache.ambari.server.orm.entities.AlertCurrentEntity;
import org.apache.ambari.server.orm.entities.AlertDefinitionEntity;
import org.apache.ambari.server.orm.entities.AlertHistoryEntity;
import org.apache.ambari.server.state.Alert;
import org.apache.ambari.server.state.AlertFirmness;
import org.apache.ambari.server.state.AlertState;
import org.apache.ambari.server.state.Cluster;
import org.apache.ambari.server.state.Clusters;
import org.apache.ambari.server.state.ConfigHelper;
import org.apache.ambari.server.state.MaintenanceState;
import org.apache.ambari.server.state.alert.SourceType;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.math.NumberUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.eventbus.AllowConcurrentEvents;
import com.google.common.eventbus.Subscribe;
import com.google.common.util.concurrent.Striped;
import com.google.inject.Inject;
import com.google.inject.Provider;
import com.google.inject.Singleton;
import com.google.inject.persist.Transactional;
/**
* The {@link AlertReceivedListener} class handles {@link AlertReceivedEvent}
* and updates the appropriate DAOs. It may also fire new
* {@link AlertStateChangeEvent} when an {@link AlertState} change is detected.
*/
@Singleton
@EagerSingleton
public class AlertReceivedListener {
/**
* Logger.
*/
private static final Logger LOG = LoggerFactory.getLogger(AlertReceivedListener.class);
@Inject
Configuration m_configuration;
@Inject
AlertsDAO m_alertsDao;
@Inject
AlertDefinitionDAO m_definitionDao;
/**
* Used for looking up whether an alert has a valid service/component/host
*/
@Inject
Provider<Clusters> m_clusters;
/**
* Used to calculate the maintenance state of new alerts being created.
* Consider the case where you have disabled alerts for a component in MM.
* This means that there are no current alerts in the system since disabling
* them removes all current instances. New alerts being created for the
* component in MM must reflect the correct MM.
*/
@Inject
private Provider<MaintenanceStateHelper> m_maintenanceStateHelper;
/**
* Receives and publishes {@link AlertEvent} instances.
*/
private AlertEventPublisher m_alertEventPublisher;
/**
* Provides fine-grained (striped) locks so that concurrent threads cannot create
* duplicate {@link AlertCurrentEntity} instances for the same alert.
*/
private Striped<Lock> creationLocks = Striped.lazyWeakLock(100);
/**
 * Creates the listener and registers it with the supplied publisher. The
 * same publisher is retained for broadcasting the {@link AlertEvent}
 * instances produced while handling received alerts.
 *
 * @param publisher
 *          the alert event publisher to register with and to publish to.
 */
@Inject
public AlertReceivedListener(AlertEventPublisher publisher) {
  this.m_alertEventPublisher = publisher;
  publisher.register(this);
}
/**
 * Handles a batch of received alerts. For every alert that is valid, has an
 * existing definition, and whose definition is enabled, one of the following
 * happens:
 * <ul>
 * <li>No current alert exists: a new history record and a new
 * {@link AlertCurrentEntity} are created immediately (under a striped lock
 * to prevent duplicates) and an {@link InitialAlertEvent} is queued.
 * {@link AlertState#SKIPPED} alerts are ignored in this case.</li>
 * <li>The state is unchanged or the alert is {@link AlertState#SKIPPED}: the
 * current alert is updated (timestamp, text, occurrences, firmness) and
 * queued for merging. A SOFT alert which has met its repeat tolerance
 * becomes HARD and an {@link AlertStateChangeEvent} is queued.</li>
 * <li>The state changed: a new history record is attached to the current
 * alert, occurrences and firmness are recalculated, and an
 * {@link AlertStateChangeEvent} is queued.</li>
 * </ul>
 * Queued entities are persisted through
 * {@link #saveEntities(List, List)} and only afterwards are the queued
 * events published.
 *
 * @param event
 *          the event to handle.
 */
@Subscribe
@AllowConcurrentEvents
@RequiresSession
public void onAlertEvent(AlertReceivedEvent event) {
  if (LOG.isDebugEnabled()) {
    LOG.debug(event.toString());
  }

  // updates are collected while iterating and persisted together at the end
  // to prevent too many transactions/commits
  List<Alert> alerts = event.getAlerts();

  List<AlertCurrentEntity> toMerge = new ArrayList<>();
  List<AlertCurrentEntity> toCreateHistoryAndMerge = new ArrayList<>();
  List<AlertEvent> alertEvents = new ArrayList<>(20);

  for (Alert alert : alerts) {
    // jobs that were running when a service/component/host was changed
    // which invalidate the alert should not be reported
    if (!isValid(alert)) {
      continue;
    }

    String clusterName = alert.getCluster();
    Long clusterId = getClusterIdByName(clusterName);
    if (clusterId == null) {
      // the alert did not resolve to a cluster; fall back to the event's ID
      clusterId = event.getClusterId();
    }

    AlertDefinitionEntity definition = m_definitionDao.findByName(clusterId,
        alert.getName());

    if (null == definition) {
      LOG.warn(
          "Received an alert for {} which is a definition that does not exist anymore",
          alert.getName());
      continue;
    }

    // it's possible that a definition which is disabled will still have a
    // running alert returned; this will ensure we don't record it
    if (!definition.getEnabled()) {
      LOG.debug(
          "Received an alert for {} which is disabled. No more alerts should be received for this definition.",
          alert.getName());
      continue;
    }

    AlertCurrentEntity current;
    AlertState alertState = alert.getState();

    // attempt to lookup the current alert
    current = getCurrentEntity(clusterId, alert, definition);

    // if it doesn't exist then we must create it, ensuring that two or more
    // aren't created from other threads
    if (null == current) {
      // if there is no current alert and the state is skipped, then simply
      // skip over this one as there is nothing to update in the database
      if (alertState == AlertState.SKIPPED) {
        continue;
      }

      // The lock key must identify the alert the same way getCurrentEntity()
      // does: when the host is blank or the definition ignores the host, the
      // lookup is by cluster/name only. Including the reporting host in the
      // key in that case would let two hosts take different locks and both
      // create a "current" row for the same host-less alert.
      String lockHostName = alert.getHostName();
      if (StringUtils.isBlank(lockHostName) || definition.isHostIgnored()) {
        lockHostName = null;
      }

      int key = Objects.hash(clusterId, alert.getName(), lockHostName);
      Lock lock = creationLocks.get(key);
      lock.lock();

      // attempt to lookup the current alert again to ensure that a previous
      // thread didn't already create it
      try {
        // if it's not null anymore, then there's no work to do here
        current = getCurrentEntity(clusterId, alert, definition);
        if (null != current) {
          continue;
        }

        // the current alert is still null, so go through and create it
        AlertHistoryEntity history = createHistory(clusterId, definition, alert);

        // this new alert must reflect the correct MM state for the
        // service/component/host
        MaintenanceState maintenanceState = MaintenanceState.OFF;
        try {
          maintenanceState = m_maintenanceStateHelper.get().getEffectiveState(clusterId, alert);
        } catch (Exception exception) {
          LOG.error("Unable to determine the maintenance mode state for {}, defaulting to OFF",
              alert, exception);
        }

        current = new AlertCurrentEntity();
        current.setMaintenanceState(maintenanceState);
        current.setAlertHistory(history);
        current.setLatestTimestamp(alert.getTimestamp());
        current.setOriginalTimestamp(alert.getTimestamp());

        // brand new alert instances being received are always HARD
        current.setFirmness(AlertFirmness.HARD);

        // persisted right away (not batched) so that other threads waiting on
        // the lock can find it on their second lookup
        m_alertsDao.create(current);

        // create the event to fire later
        alertEvents.add(new InitialAlertEvent(clusterId, alert, current));
      } finally {
        // release the lock for this alert (also runs on the "continue" above)
        lock.unlock();
      }
    } else if (alertState == current.getAlertHistory().getAlertState()
        || alertState == AlertState.SKIPPED) {

      // update the timestamp no matter what
      current.setLatestTimestamp(alert.getTimestamp());

      // only update some fields if the alert isn't SKIPPED
      if (alertState != AlertState.SKIPPED) {
        current.setLatestText(alert.getText());

        // ++ the occurrences (should be safe enough since we should ever only
        // be handling unique alert events concurrently
        long occurrences = current.getOccurrences() + 1;
        current.setOccurrences(occurrences);

        // ensure that if we've met the repeat tolerance and the alert is
        // still SOFT, then we transition it to HARD - we also need to fire an
        // event
        AlertFirmness firmness = current.getFirmness();
        int repeatTolerance = getRepeatTolerance(definition, clusterName);
        if (firmness == AlertFirmness.SOFT && occurrences >= repeatTolerance) {
          current.setFirmness(AlertFirmness.HARD);

          // create the event to fire later; "firmness" still holds the
          // previous (SOFT) value at this point
          AlertStateChangeEvent stateChangedEvent = new AlertStateChangeEvent(clusterId, alert,
              current, alertState, firmness);

          alertEvents.add(stateChangedEvent);
        }
      }

      // some special cases for SKIPPED alerts
      if (alertState == AlertState.SKIPPED) {
        // set the text on a SKIPPED alert IFF it's not blank; a blank text
        // field means that the alert doesn't want to change the existing text
        String alertText = alert.getText();
        if (StringUtils.isNotBlank(alertText)) {
          current.setLatestText(alertText);
        }
      }

      // store the entity for merging later
      toMerge.add(current);
    } else {
      if (LOG.isDebugEnabled()) {
        LOG.debug(
            "Alert State Changed: CurrentId {}, CurrentTimestamp {}, HistoryId {}, HistoryState {}",
            current.getAlertId(), current.getLatestTimestamp(),
            current.getAlertHistory().getAlertId(),
            current.getAlertHistory().getAlertState());
      }

      // capture the previous values before they are overwritten below; they
      // are needed for the state change event
      AlertHistoryEntity oldHistory = current.getAlertHistory();
      AlertState oldState = oldHistory.getAlertState();
      AlertFirmness oldFirmness = current.getFirmness();

      // insert history, update current
      AlertHistoryEntity history = createHistory(clusterId,
          oldHistory.getAlertDefinition(), alert);

      current.setLatestTimestamp(alert.getTimestamp());
      current.setOriginalTimestamp(alert.getTimestamp());
      current.setLatestText(alert.getText());
      current.setAlertHistory(history);

      // figure out how to set the occurrences correctly
      switch (alertState) {
        // an OK state always resets, regardless of what the old one was
        case OK:
          current.setOccurrences(1);
          break;
        case CRITICAL:
        // SKIPPED cannot reach this branch (it is handled by the previous
        // "else if"); the label is kept so that every state is listed
        case SKIPPED:
        case UNKNOWN:
        case WARNING:
          // OK -> non-OK is a reset
          if (oldState == AlertState.OK) {
            current.setOccurrences(1);
          } else {
            // non-OK -> non-OK is a continuation
            current.setOccurrences(current.getOccurrences() + 1);
          }
          break;
        default:
          break;
      }

      // set the firmness of the new alert state based on the state, type,
      // occurrences, and repeat tolerance
      AlertFirmness firmness = calculateFirmnessForStateChange(clusterName, definition,
          alertState, current.getOccurrences());

      current.setFirmness(firmness);

      // store the entity for merging later
      toCreateHistoryAndMerge.add(current);

      // create the event to fire later
      alertEvents.add(new AlertStateChangeEvent(clusterId, alert, current, oldState, oldFirmness));
    }
  }

  // invokes the EntityManager create/merge on various entities in a single
  // transaction
  saveEntities(toMerge, toCreateHistoryAndMerge);

  // broadcast events only after the entities have been saved
  for (AlertEvent eventToFire : alertEvents) {
    m_alertEventPublisher.publish(eventToFire);
  }
}
/**
 * Gets the cluster ID given a name.
 *
 * @param clusterName
 *          the name of the cluster to look up.
 * @return the ID of the cluster, or {@code null} if the lookup throws an
 *         {@link AmbariException} or yields no cluster.
 */
private Long getClusterIdByName(String clusterName) {
  try {
    Cluster cluster = m_clusters.get().getCluster(clusterName);

    // a null result is treated the same as a failed lookup instead of
    // surfacing as a NullPointerException in the event handler
    if (null == cluster) {
      LOG.warn("Cluster lookup failed for cluster named {}", clusterName);
      return null;
    }

    return cluster.getClusterId();
  } catch (AmbariException e) {
    LOG.warn("Cluster lookup failed for cluster named {}", clusterName);
    return null;
  }
}
/**
 * Looks up the {@link AlertCurrentEntity} which corresponds to the alert
 * being received, if any. The lookup is by host and name only when the alert
 * reports a non-blank host and the definition does not ignore the host;
 * otherwise it is by name alone.
 *
 * @param clusterId
 *          the ID of the cluster.
 * @param alert
 *          the alert being received (not {@code null}).
 * @param definition
 *          the {@link AlertDefinitionEntity} for the alert being received
 *          (not {@code null}).
 * @return the existing current alert or {@code null} for none.
 */
private AlertCurrentEntity getCurrentEntity(long clusterId, Alert alert, AlertDefinitionEntity definition){
  String reportedHost = alert.getHostName();
  boolean hostSpecific = StringUtils.isNotBlank(reportedHost) && !definition.isHostIgnored();

  if (hostSpecific) {
    return m_alertsDao.findCurrentByHostAndName(clusterId, reportedHost, alert.getName());
  }

  return m_alertsDao.findCurrentByNameNoHost(clusterId, alert.getName());
}
/**
 * Persists the collected current alerts inside of a single transaction.
 *
 * @param toMerge
 *          current alerts which only need to be merged.
 * @param toCreateHistoryAndMerge
 *          current alerts whose new history record must be created before
 *          the current alert itself is merged.
 */
@Transactional
void saveEntities(List<AlertCurrentEntity> toMerge,
    List<AlertCurrentEntity> toCreateHistoryAndMerge) {

  // plain updates: no new history record is involved
  for (AlertCurrentEntity unchanged : toMerge) {
    m_alertsDao.merge(unchanged, m_configuration.isAlertCacheEnabled());
  }

  // state changes: the history record goes in first, then the current alert
  for (AlertCurrentEntity changed : toCreateHistoryAndMerge) {
    m_alertsDao.create(changed.getAlertHistory());
    m_alertsDao.merge(changed);

    if (!LOG.isDebugEnabled()) {
      continue;
    }

    LOG.debug(
        "Alert State Merged: CurrentId {}, CurrentTimestamp {}, HistoryId {}, HistoryState {}",
        changed.getAlertId(), changed.getLatestTimestamp(),
        changed.getAlertHistory().getAlertId(),
        changed.getAlertHistory().getAlertState());
  }
}
/**
 * Determines whether the alert refers to a cluster, service, component and
 * host combination which is still valid. The checks are applied in this
 * order:
 * <ol>
 * <li>AMBARI/AMBARI_SERVER alerts are always valid.</li>
 * <li>Alerts without a cluster name are valid when they report no host, or
 * when the reported host exists.</li>
 * <li>The named cluster must be resolvable.</li>
 * <li>AMBARI/AMBARI_AGENT alerts must report a host which exists and is
 * mapped to the cluster.</li>
 * <li>Any other alert which reports a host requires that the host exists,
 * that the service exists in the cluster and, when a component is reported,
 * that the component is on that host.</li>
 * </ol>
 * This guards against alerts which arrive after a service/component was
 * removed, after a host was removed while its agent keeps reporting, or
 * after a cluster was renamed.
 *
 * @param alert
 *          the alert.
 * @return {@code true} if the alert is for a valid combination of
 *         cluster/service/component/host.
 */
private boolean isValid(Alert alert) {
  final String clusterName = alert.getCluster();
  final String serviceName = alert.getService();
  final String componentName = alert.getComponent();
  final String hostName = alert.getHostName();

  final boolean forAmbariService = Services.AMBARI.name().equals(serviceName);

  // AMBARI/AMBARI_SERVER is always a valid service/component combination
  if (forAmbariService && Components.AMBARI_SERVER.name().equals(componentName)) {
    return true;
  }

  // an alert which is not bound to a cluster is most likely a host alert;
  // it is valid when it names no host at all or when the named host exists
  if (StringUtils.isBlank(clusterName)) {
    boolean hostAcceptable = StringUtils.isBlank(hostName)
        || m_clusters.get().hostExists(hostName);

    if (!hostAcceptable) {
      LOG.error("Unable to process alert {} for an invalid host named {}",
          alert.getName(), hostName);
    }

    return hostAcceptable;
  }

  // from here on a cluster name was reported and the alert is not for the
  // AMBARI_SERVER component, so resolve the cluster
  final Cluster cluster;
  try {
    cluster = m_clusters.get().getCluster(clusterName);
  } catch (AmbariException ambariException) {
    // the stack trace is only included when debugging
    if (LOG.isDebugEnabled()) {
      LOG.debug("Unable to process alert {} for an invalid cluster named {}",
          alert.getName(), clusterName, ambariException);
    } else {
      LOG.error("Unable to process alert {} for an invalid cluster named {}",
          alert.getName(), clusterName);
    }

    return false;
  }

  if (null == cluster) {
    LOG.error("Unable to process alert {} for an invalid cluster named {}",
        alert.getName(), clusterName);

    return false;
  }

  // AMBARI/AMBARI_AGENT is valid IFF the agent reported a host which is
  // still a part of the reported cluster
  if (forAmbariService && Components.AMBARI_AGENT.name().equals(componentName)) {
    boolean agentHostInCluster = StringUtils.isNotBlank(hostName)
        && m_clusters.get().hostExists(hostName)
        && m_clusters.get().isHostMappedToCluster(clusterName, hostName);

    if (!agentHostInCluster) {
      LOG.warn(
          "Unable to process alert {} for cluster {} and host {} because the host is not a part of the cluster.",
          alert.getName(), clusterName, hostName);
    }

    return agentHostInCluster;
  }

  // remaining alerts which do not report a host need no further checks
  if (StringUtils.isBlank(hostName)) {
    return true;
  }

  if (!m_clusters.get().hostExists(hostName)) {
    LOG.warn("Unable to process alert {} for an invalid host named {}",
        alert.getName(), hostName);

    return false;
  }

  if (!cluster.getServices().containsKey(serviceName)) {
    LOG.warn("Unable to process alert {} for an invalid service named {}",
        alert.getName(), serviceName);

    return false;
  }

  // a host/component alert additionally requires that the component is
  // actually installed on that host
  if (null != componentName
      && !cluster.getHosts(serviceName, componentName).contains(hostName)) {
    LOG.warn(
        "Unable to process alert {} for an invalid service {} and component {} on host {}",
        alert.getName(), serviceName, componentName, hostName);

    return false;
  }

  return true;
}
/**
 * Builds a new, unsaved historical alert record from a definition and a
 * received alert.
 *
 * @param clusterId
 *          the cluster id
 * @param definition
 *          the definition supplying the ID, label and host-ignored flag
 * @param alert
 *          the alert data
 * @return the new history record
 */
private AlertHistoryEntity createHistory(long clusterId,
    AlertDefinitionEntity definition, Alert alert) {
  AlertHistoryEntity entry = new AlertHistoryEntity();

  // values taken from the definition
  entry.setAlertDefinition(definition);
  entry.setAlertDefinitionId(definition.getDefinitionId());
  entry.setAlertLabel(definition.getLabel());

  // values taken from the received alert
  entry.setAlertInstance(alert.getInstance());
  entry.setAlertState(alert.getState());
  entry.setAlertText(alert.getText());
  entry.setAlertTimestamp(Long.valueOf(alert.getTimestamp()));
  entry.setServiceName(alert.getService());
  entry.setComponentName(alert.getComponent());

  entry.setClusterId(Long.valueOf(clusterId));

  // only set a host for the history item if the alert definition says to
  entry.setHostName(definition.isHostIgnored() ? null : alert.getHostName());

  return entry;
}
/**
 * Calculates the firmness for an {@link AlertCurrentEntity} whose state has
 * changed. The following rules apply:
 * <ul>
 * <li>If an alert is {@link AlertState#OK}, then the firmness is always
 * {@link AlertFirmness#HARD}.</li>
 * <li>If an alert is {@link SourceType#AGGREGATE}, then the firmness is
 * always {@link AlertFirmness#HARD}.</li>
 * <li>Otherwise, the firmness is {@link AlertFirmness#HARD} when the repeat
 * tolerance is 1 or less, or when the occurrences have reached the repeat
 * tolerance; it is {@link AlertFirmness#SOFT} in all other cases.</li>
 * </ul>
 *
 * @param clusterName
 *          the name of the cluster, used to resolve the repeat tolerance.
 * @param definition
 *          the definition to read any repeat tolerance overrides from.
 * @param state
 *          the state of the {@link AlertCurrentEntity}.
 * @param occurrences
 *          occurrences of the alert in the current state (used for
 *          calculating firmness when moving between non-OK states)
 * @return the firmness to set on the current alert.
 */
private AlertFirmness calculateFirmnessForStateChange(String clusterName, AlertDefinitionEntity definition,
    AlertState state, long occurrences) {
  // OK has fulfilled the alert's conditions, and aggregate alerts only react
  // to HARD alerts; neither of them is subject to the repeat tolerance
  boolean alwaysHard = state == AlertState.OK
      || definition.getSourceType() == SourceType.AGGREGATE;

  if (alwaysHard) {
    return AlertFirmness.HARD;
  }

  int tolerance = getRepeatTolerance(definition, clusterName);
  boolean toleranceMet = tolerance <= 1 || tolerance <= occurrences;

  return toleranceMet ? AlertFirmness.HARD : AlertFirmness.SOFT;
}
/**
 * Gets the repeat tolerance value for the specified definition. This method
 * will return the override from the definition if
 * {@link AlertDefinitionEntity#isRepeatToleranceEnabled()} is {@code true}.
 * Otherwise, it uses {@link ConfigHelper#CLUSTER_ENV_ALERT_REPEAT_TOLERANCE},
 * defaulting to {@code 1} if the cluster cannot be resolved, the property is
 * not found, or its value is not a valid integer.
 *
 * @param definition
 *          the definition (not {@code null}).
 * @param clusterName
 *          the name of the cluster.
 * @return the repeat tolerance for the alert
 */
private int getRepeatTolerance(AlertDefinitionEntity definition, String clusterName) {
  // if the definition overrides the global value, then use that
  if (definition.isRepeatToleranceEnabled()) {
    return definition.getRepeatTolerance();
  }

  int repeatTolerance = 1;
  try {
    Cluster cluster = m_clusters.get().getCluster(clusterName);

    // a missing cluster falls back to the default instead of failing the
    // whole alert batch with a NullPointerException
    if (null == cluster) {
      LOG.warn("Unable to read {}/{} from cluster {}, defaulting to 1", ConfigHelper.CLUSTER_ENV,
          ConfigHelper.CLUSTER_ENV_ALERT_REPEAT_TOLERANCE, clusterName);
      return repeatTolerance;
    }

    String value = cluster.getClusterProperty(ConfigHelper.CLUSTER_ENV_ALERT_REPEAT_TOLERANCE, "1");
    repeatTolerance = NumberUtils.toInt(value, 1);
  } catch (AmbariException ambariException) {
    LOG.warn("Unable to read {}/{} from cluster {}, defaulting to 1", ConfigHelper.CLUSTER_ENV,
        ConfigHelper.CLUSTER_ENV_ALERT_REPEAT_TOLERANCE, clusterName, ambariException);
  }

  return repeatTolerance;
}
}