/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.ambari.server.alerts; import java.lang.management.ManagementFactory; import java.lang.management.RuntimeMXBean; import java.text.MessageFormat; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; import java.util.TreeSet; import org.apache.ambari.server.orm.dao.AlertsDAO; import org.apache.ambari.server.orm.entities.AlertCurrentEntity; import org.apache.ambari.server.orm.entities.AlertDefinitionEntity; import org.apache.ambari.server.orm.entities.AlertHistoryEntity; import org.apache.ambari.server.state.Alert; import org.apache.ambari.server.state.AlertState; import org.apache.ambari.server.state.Cluster; import org.apache.ambari.server.state.MaintenanceState; import org.apache.ambari.server.state.alert.AlertDefinition; import org.apache.ambari.server.state.alert.AlertDefinitionFactory; import org.apache.ambari.server.state.alert.ParameterizedSource.AlertParameter; import org.apache.ambari.server.state.alert.ServerSource; import org.apache.ambari.server.state.alert.SourceType; import org.apache.ambari.server.state.services.AmbariServerAlertService; import org.apache.commons.lang.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.inject.Inject; /** * The {@link StaleAlertRunnable} is used by the * {@link AmbariServerAlertService} to check the last time that an alert was * checked and determine if it seems to no longer be running. It will produce a * single alert with {@link AlertState#CRITICAL} along with a textual * description of the alerts that are stale. */ public class StaleAlertRunnable extends AlertRunnable { /** * Logger. */ private final static Logger LOG = LoggerFactory.getLogger(StaleAlertRunnable.class); /** * The message for the alert when all services have run in their designated * intervals. */ private static final String ALL_ALERTS_CURRENT_MSG = "All alerts have run within their time intervals."; /** * The message to use when alerts are detected as stale. */ private static final String STALE_ALERTS_MSG = "There are {0} stale alerts from {1} host(s):\n{2}"; private static final String TIMED_LABEL_MSG = "{0} ({1})"; private static final String HOST_LABEL_MSG = "{0}\n [{1}]"; /** * Convert the minutes for the delay of an alert into milliseconds. */ private static final long MINUTE_TO_MS_CONVERSION = 60L * 1000L; private static final long MILLISECONDS_PER_MINUTE = 1000L * 60L; private static final int MINUTES_PER_DAY = 24 * 60; private static final int MINUTES_PER_HOUR = 60; /** * The multiplier for the interval of the definition which is being checked * for staleness. If this value is {@code 2}, then alerts are considered stale * if they haven't run in more than 2x their interval value. */ private static final int INTERVAL_WAIT_FACTOR_DEFAULT = 2; /** * A parameter which exposes the interval multipler to use for calculating * staleness. If this does not exist, then * {@link #INTERVAL_WAIT_FACTOR_DEFAULT} will be used. */ private static final String STALE_INTERVAL_MULTIPLIER_PARAM_KEY = "stale.interval.multiplier"; /** * Used to get the current alerts and the last time they ran. */ @Inject private AlertsDAO m_alertsDao; /** * Used for converting {@link AlertDefinitionEntity} into * {@link AlertDefinition} instances. */ @Inject private AlertDefinitionFactory m_definitionFactory; /** * Constructor. * * @param definitionName */ public StaleAlertRunnable(String definitionName) { super(definitionName); } /** * {@inheritDoc} */ @Override List<Alert> execute(Cluster cluster, AlertDefinitionEntity myDefinition) { // get the multiplier int waitFactor = getWaitFactorMultiplier(myDefinition); // use the uptime of the Ambari Server as a way to determine if we need to // give the alert more time to report in RuntimeMXBean rb = ManagementFactory.getRuntimeMXBean(); long uptime = rb.getUptime(); int totalStaleAlerts = 0; Set<String> staleAlertGroupings = new TreeSet<>(); Map<String, Set<String>> staleAlertsByHost = new HashMap<>(); Set<String> hostsWithStaleAlerts = new TreeSet<>(); // get the cluster's current alerts List<AlertCurrentEntity> currentAlerts = m_alertsDao.findCurrentByCluster( cluster.getClusterId()); long now = System.currentTimeMillis(); // for each current alert, check to see if the last time it ran is // more than INTERVAL_WAIT_FACTOR * its interval value (indicating it hasn't // run) for (AlertCurrentEntity current : currentAlerts) { AlertHistoryEntity history = current.getAlertHistory(); AlertDefinitionEntity currentDefinition = history.getAlertDefinition(); // skip aggregates as they are special if (currentDefinition.getSourceType() == SourceType.AGGREGATE) { continue; } // skip alerts in maintenance mode if (current.getMaintenanceState() != MaintenanceState.OFF) { continue; } // skip alerts that have not run yet if (current.getLatestTimestamp() == 0) { continue; } // skip this alert (who watches the watchers) if (currentDefinition.getDefinitionName().equals(m_definitionName)) { continue; } // convert minutes to milliseconds for the definition's interval long intervalInMillis = currentDefinition.getScheduleInterval() * MINUTE_TO_MS_CONVERSION; // if the server hasn't been up long enough to consider this alert stale, // then don't mark it stale - this is to protect against cases where // Ambari was down for a while and after startup it hasn't received the // alert because it has a longer interval than this stale alert check: // // Stale alert check - every 5 minutes // Foo alert cehck - every 10 minutes // Ambari down for 35 minutes for upgrade if (uptime <= waitFactor * intervalInMillis) { continue; } // if the last time it was run is >= INTERVAL_WAIT_FACTOR * the interval, // it's stale long timeDifference = now - current.getLatestTimestamp(); if (timeDifference >= waitFactor * intervalInMillis) { // increase the count totalStaleAlerts++; // it is technically possible to have a null/blank label; if so, // default to the name of the definition String label = currentDefinition.getLabel(); if (StringUtils.isEmpty(label)) { label = currentDefinition.getDefinitionName(); } if (null != history.getHostName()) { // keep track of the host, if not null String hostName = history.getHostName(); hostsWithStaleAlerts.add(hostName); if (!staleAlertsByHost.containsKey(hostName)) { staleAlertsByHost.put(hostName, new TreeSet<String>()); } staleAlertsByHost.get(hostName).add(MessageFormat.format(TIMED_LABEL_MSG, label, millisToHumanReadableStr(timeDifference))); } else { // non host alerts staleAlertGroupings.add(label); } } } for (String host : staleAlertsByHost.keySet()) { staleAlertGroupings.add(MessageFormat.format(HOST_LABEL_MSG, host, StringUtils.join(staleAlertsByHost.get(host), ",\n "))); } AlertState alertState = AlertState.OK; String alertText = ALL_ALERTS_CURRENT_MSG; // if there are stale alerts, mark as CRITICAL with the list of // alerts if (!staleAlertGroupings.isEmpty()) { alertState = AlertState.CRITICAL; alertText = MessageFormat.format(STALE_ALERTS_MSG, totalStaleAlerts, hostsWithStaleAlerts.size(), StringUtils.join(staleAlertGroupings, ",\n")); } Alert alert = new Alert(myDefinition.getDefinitionName(), null, myDefinition.getServiceName(), myDefinition.getComponentName(), null, alertState); alert.setLabel(myDefinition.getLabel()); alert.setText(alertText); alert.setTimestamp(now); alert.setCluster(cluster.getClusterName()); return Collections.singletonList(alert); } /** * Converts given {@code milliseconds} to human-readable {@link String} like "1d 2h 3m" or "2h 4m". * @param milliseconds milliseconds to convert * @return human-readable string */ private static String millisToHumanReadableStr(long milliseconds){ int min, hour, days; min = (int)(milliseconds / MILLISECONDS_PER_MINUTE); days = min / MINUTES_PER_DAY; min = min % MINUTES_PER_DAY; hour = min / MINUTES_PER_HOUR; min = min % MINUTES_PER_HOUR; String result = ""; if(days > 0) { result += days + "d "; } if(hour > 0) { result += hour + "h "; } if(min > 0) { result += min + "m "; } return result.trim(); } /** * Gets the wait factor multiplier off of the definition, returning * {@link #INTERVAL_WAIT_FACTOR_DEFAULT} if not specified. This will look for * {@link #STALE_INTERVAL_MULTIPLIER_PARAM_KEY} in the definition parameters. * The value returned from this method will be guaranteed to be in the range * of 2 to 10. * * @param entity * the definition to read * @return the wait factor interval multiplier */ private int getWaitFactorMultiplier(AlertDefinitionEntity entity) { // start with the default int waitFactor = INTERVAL_WAIT_FACTOR_DEFAULT; // coerce the entity into a business object so that the list of parameters // can be extracted and used for threshold calculation try { AlertDefinition definition = m_definitionFactory.coerce(entity); ServerSource serverSource = (ServerSource) definition.getSource(); List<AlertParameter> parameters = serverSource.getParameters(); for (AlertParameter parameter : parameters) { Object value = parameter.getValue(); if (StringUtils.equals(parameter.getName(), STALE_INTERVAL_MULTIPLIER_PARAM_KEY)) { waitFactor = getThresholdValue(value, INTERVAL_WAIT_FACTOR_DEFAULT); } } if (waitFactor < 2 || waitFactor > 10) { LOG.warn( "The interval multipler of {} is outside the valid range for {} and will be set to 2", waitFactor, entity.getLabel()); waitFactor = 2; } } catch (Exception exception) { LOG.error("Unable to read the {} parameter for {}", STALE_INTERVAL_MULTIPLIER_PARAM_KEY, StaleAlertRunnable.class.getSimpleName(), exception); } return waitFactor; } }