package org.stagemonitor.alerting;

import com.codahale.metrics.Counter;
import com.codahale.metrics.Gauge;
import com.codahale.metrics.Histogram;
import com.codahale.metrics.Meter;
import com.codahale.metrics.Metric;
import com.codahale.metrics.Timer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.stagemonitor.alerting.alerter.AlertSender;
import org.stagemonitor.alerting.check.Check;
import org.stagemonitor.alerting.check.CheckResult;
import org.stagemonitor.alerting.incident.Incident;
import org.stagemonitor.alerting.incident.IncidentRepository;
import org.stagemonitor.core.MeasurementSession;
import org.stagemonitor.core.metrics.metrics2.Metric2Registry;
import org.stagemonitor.core.metrics.metrics2.MetricName;
import org.stagemonitor.core.metrics.metrics2.ScheduledMetrics2Reporter;
import org.stagemonitor.core.util.JsonUtils;

import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

/**
 * Reporter that evaluates the {@link Check}s configured in the {@link AlertingPlugin} against the
 * metrics handed to {@link #reportMetrics}.
 *
 * <p>For every active check belonging to this application, the matching metrics are checked, the
 * resulting {@link Incident} is created, updated or deleted in the {@link IncidentRepository}, and
 * the incident is passed to the {@link AlertSender}.
 *
 * <p>Instances are created through {@link #forRegistry(Metric2Registry)}.
 */
public class ThresholdMonitoringReporter extends ScheduledMetrics2Reporter {

    /** Maximum number of load-and-persist attempts per check before giving up on saving an incident. */
    private static final int OPTIMISTIC_CONCURRENCY_CONTROL_RETRIES = 10;

    private final Logger logger = LoggerFactory.getLogger(getClass());
    private final AlertSender alertSender;
    private final IncidentRepository incidentRepository;
    private final MeasurementSession measurementSession;
    private final AlertingPlugin alertingPlugin;

    /**
     * Starts building a reporter for the given registry.
     *
     * @param registry the registry passed on to the builder
     * @return a new builder
     */
    public static ThresholdMonitoringReporterBuilder forRegistry(Metric2Registry registry) {
        return new ThresholdMonitoringReporterBuilder(registry);
    }

    /**
     * Creates the reporter from the collaborators configured on the builder.
     *
     * @param builder the builder holding the alerting plugin, alert sender, incident repository
     *                and measurement session
     */
    public ThresholdMonitoringReporter(ThresholdMonitoringReporterBuilder builder) {
        super(builder);
        this.alertingPlugin = builder.getAlertingPlugin();
        this.alertSender = builder.getAlertSender();
        this.incidentRepository = builder.getIncidentRepository();
        this.measurementSession = builder.getMeasurementSession();
    }

    /**
     * Groups all reported metrics by their name and runs every active check whose application
     * equals the application name of this reporter's measurement session.
     *
     * <p>If the measurement session has no application name, no check is evaluated.
     *
     * @param gauges     the reported gauges
     * @param counters   the reported counters
     * @param histograms the reported histograms
     * @param meters     the reported meters
     * @param timers     the reported timers
     */
    @Override
    public void reportMetrics(Map<MetricName, Gauge> gauges, Map<MetricName, Counter> counters,
                              Map<MetricName, Histogram> histograms, Map<MetricName, Meter> meters,
                              Map<MetricName, Timer> timers) {
        // Read once per report cycle instead of once per check.
        final String applicationName = measurementSession.getApplicationName();
        if (applicationName == null) {
            // Without an application name no check can match; calling equals() on it would
            // throw a NullPointerException and abort the whole report cycle.
            return;
        }

        Map<String, Map<MetricName, Metric>> metricsGroupedByName = new HashMap<String, Map<MetricName, Metric>>();
        addMetrics(metricsGroupedByName, gauges);
        addMetrics(metricsGroupedByName, counters);
        addMetrics(metricsGroupedByName, histograms);
        addMetrics(metricsGroupedByName, meters);
        addMetrics(metricsGroupedByName, timers);

        for (Check check : alertingPlugin.getChecks().values()) {
            if (applicationName.equals(check.getApplication()) && check.isActive()) {
                checkMetrics(metricsGroupedByName, check);
            }
        }
    }

    /**
     * Adds the given metrics to the two-level map, keyed first by {@link MetricName#getName()} and
     * then by the full {@link MetricName}.
     *
     * @param metricsGroupedByName the map to add to; inner maps are created on demand
     * @param metrics              the metrics to add
     */
    private <T extends Metric> void addMetrics(Map<String, Map<MetricName, Metric>> metricsGroupedByName,
                                               Map<MetricName, T> metrics) {
        for (Map.Entry<MetricName, T> entry : metrics.entrySet()) {
            final String name = entry.getKey().getName();
            Map<MetricName, Metric> metricsForName = metricsGroupedByName.get(name);
            if (metricsForName == null) {
                metricsForName = new HashMap<MetricName, Metric>();
                metricsGroupedByName.put(name, metricsForName);
            }
            metricsForName.put(entry.getKey(), entry.getValue());
        }
    }

    /**
     * Runs a single check against all metrics that share the name of the check's target and match
     * the target, then records the collected results as an incident.
     *
     * <p>A {@link RuntimeException} thrown while checking one metric, or while recording the
     * incident, is logged as a warning and does not propagate.
     *
     * @param metricsGroupedByName all reported metrics, grouped by metric name
     * @param check                the check to evaluate
     */
    private void checkMetrics(Map<String, Map<MetricName, Metric>> metricsGroupedByName, Check check) {
        List<CheckResult> checkResults = new LinkedList<CheckResult>();

        Map<MetricName, Metric> metricsOfName = metricsGroupedByName.get(check.getTarget().getName());
        if (metricsOfName == null) {
            metricsOfName = Collections.emptyMap();
        }

        for (Map.Entry<MetricName, Metric> entry : metricsOfName.entrySet()) {
            if (entry.getKey().matches(check.getTarget())) {
                try {
                    checkResults.addAll(check.check(entry.getKey(), asMap(entry.getValue())));
                } catch (RuntimeException e) {
                    logger.warn(e.getMessage(), e);
                }
            }
        }

        // Called even when there are no results, so that an existing incident can be re-evaluated.
        try {
            addIncident(check, checkResults);
        } catch (RuntimeException e) {
            logger.warn(e.getMessage(), e);
        }
    }

    /**
     * Converts a metric into a map via the shared JSON mapper.
     *
     * @param metric the metric to convert
     * @return the metric's properties as a map
     */
    // The conversion target is the raw Map class, so the element types cannot be checked here.
    // NOTE(review): the declared value type Number is an assumption about how metrics serialize
    // (e.g. a Gauge with a non-numeric value would violate it) - confirm against the mapper setup.
    @SuppressWarnings("unchecked")
    private Map<String, Number> asMap(Metric metric) {
        return JsonUtils.getMapper().convertValue(metric, Map.class);
    }

    /**
     * Persists the incident derived from the results and, if there is one, hands it to the
     * alert sender.
     *
     * @param check   the check that produced the results
     * @param results the results of the check
     */
    private void addIncident(Check check, List<CheckResult> results) {
        Incident incident = getAndPersistIncident(check, results);
        if (incident != null) {
            alertSender.sendAlerts(check, incident);
        }
    }

    /**
     * Builds the current incident for the check and tries to persist it, repeating the
     * load-and-save cycle up to {@link #OPTIMISTIC_CONCURRENCY_CONTROL_RETRIES} times until the
     * repository accepts the change.
     *
     * <p>NOTE(review): when all attempts fail, an error is logged but the last (unpersisted)
     * incident is still returned and therefore still alerted on - confirm this is intended.
     *
     * @param check   the check that produced the results
     * @param results the results of the check
     * @return the incident of the last attempt, or {@code null} if there is no incident
     */
    private Incident getAndPersistIncident(Check check, List<CheckResult> results) {
        boolean successfullyPersisted = false;
        Incident incident = null;
        for (int i = 0; i < OPTIMISTIC_CONCURRENCY_CONTROL_RETRIES && !successfullyPersisted; i++) {
            // Re-read the stored incident on every attempt so the new one is based on the latest state.
            incident = getOrCreateIncident(check, results);
            successfullyPersisted = trySaveOrDeleteIncident(check, incident);
        }
        if (!successfullyPersisted) {
            logger.error("Failed to save incident {} after {} retries.", incident, OPTIMISTIC_CONCURRENCY_CONTROL_RETRIES);
        }
        return incident;
    }

    /**
     * Creates the incident that reflects the given results.
     *
     * <p>If the repository holds an incident for the check, the new incident is derived from it.
     * Otherwise a fresh incident is created, unless the most severe status of the results is
     * {@link CheckResult.Status#OK}, in which case there is nothing to report.
     *
     * @param check   the check that produced the results
     * @param results the results of the check
     * @return the current incident, or {@code null} if no incident exists and all results are OK
     */
    private Incident getOrCreateIncident(Check check, List<CheckResult> results) {
        Incident previousIncident = incidentRepository.getIncidentByCheckId(check.getId());
        if (previousIncident != null) {
            return new Incident(previousIncident, measurementSession, results);
        }
        if (CheckResult.getMostSevereStatus(results) == CheckResult.Status.OK) {
            return null;
        }
        return new Incident(check, measurementSession, results);
    }

    /**
     * Applies the incident to the repository: deletes it when its new status is OK, creates it
     * when it has no old status yet, and updates it otherwise.
     *
     * @param check    the check the incident belongs to (used for log messages)
     * @param incident the incident to persist; {@code null} means there is nothing to do
     * @return {@code true} if nothing had to be done or the repository accepted the change,
     *         {@code false} if the repository rejected it
     */
    private boolean trySaveOrDeleteIncident(Check check, Incident incident) {
        if (incident == null) {
            return true;
        }
        if (incident.getNewStatus() == CheckResult.Status.OK) {
            if (!incidentRepository.deleteIncident(incident)) {
                logger.warn("Optimistic lock failure when deleting incident for check group {}.", check.getId());
                return false;
            }
        } else if (incident.getOldStatus() == null) {
            // A brand-new incident has no previous status; record OK as the status it came from.
            incident.setOldStatus(CheckResult.Status.OK);
            if (!incidentRepository.createIncident(incident)) {
                logger.warn("Error while creating incident for check group {}. "
                        + "A incident for the same check group already exists.", check.getId());
                return false;
            }
        } else if (!incidentRepository.updateIncident(incident)) {
            logger.warn("Optimistic lock failure when updating incident for check group {}.", check.getId());
            return false;
        }
        return true;
    }

    /**
     * Builder for {@link ThresholdMonitoringReporter}. Obtain an instance via
     * {@link ThresholdMonitoringReporter#forRegistry(Metric2Registry)}.
     */
    public static class ThresholdMonitoringReporterBuilder
            extends ScheduledMetrics2Reporter.Builder<ThresholdMonitoringReporter, ThresholdMonitoringReporterBuilder> {

        private AlertSender alertSender;
        private IncidentRepository incidentRepository;
        private MeasurementSession measurementSession;
        private AlertingPlugin alertingPlugin;

        private ThresholdMonitoringReporterBuilder(Metric2Registry registry) {
            super(registry, "threshold-monitoring-reporter");
        }

        /**
         * @return a new reporter configured from this builder
         */
        @Override
        public ThresholdMonitoringReporter build() {
            return new ThresholdMonitoringReporter(this);
        }

        /** @return the configured alert sender, or {@code null} if none was set */
        public AlertSender getAlertSender() {
            return alertSender;
        }

        /**
         * @param alertSender the sender that receives incidents
         * @return this builder
         */
        public ThresholdMonitoringReporterBuilder alertSender(AlertSender alertSender) {
            this.alertSender = alertSender;
            return this;
        }

        /** @return the configured incident repository, or {@code null} if none was set */
        public IncidentRepository getIncidentRepository() {
            return incidentRepository;
        }

        /**
         * @param incidentRepository the repository used to load and persist incidents
         * @return this builder
         */
        public ThresholdMonitoringReporterBuilder incidentRepository(IncidentRepository incidentRepository) {
            this.incidentRepository = incidentRepository;
            return this;
        }

        /** @return the configured measurement session, or {@code null} if none was set */
        public MeasurementSession getMeasurementSession() {
            return measurementSession;
        }

        /**
         * @param measurementSession the session whose application name selects the checks to run
         * @return this builder
         */
        public ThresholdMonitoringReporterBuilder measurementSession(MeasurementSession measurementSession) {
            this.measurementSession = measurementSession;
            return this;
        }

        /** @return the configured alerting plugin, or {@code null} if none was set */
        public AlertingPlugin getAlertingPlugin() {
            return alertingPlugin;
        }

        /**
         * @param alertingPlugin the plugin that supplies the checks
         * @return this builder
         */
        public ThresholdMonitoringReporterBuilder alertingPlugin(AlertingPlugin alertingPlugin) {
            this.alertingPlugin = alertingPlugin;
            return this;
        }
    }
}