/*****************************************************************************
*
* Copyright (C) Zenoss, Inc. 2010-2013, all rights reserved.
*
* This content is made available according to terms specified in
* License.zenoss under the directory where your Zenoss product is installed.
*
****************************************************************************/
package org.zenoss.zep.impl;
import com.codahale.metrics.annotation.Timed;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.ApplicationListener;
import org.zenoss.protobufs.zep.Zep;
import org.zenoss.protobufs.zep.Zep.Event;
import org.zenoss.protobufs.zep.Zep.EventSeverity;
import org.zenoss.zep.EventPublisher;
import org.zenoss.zep.UUIDGenerator;
import org.zenoss.zep.ZepException;
import org.zenoss.zep.dao.ConfigDao;
import org.zenoss.zep.dao.FlapTrackerDao;
import org.zenoss.zep.dao.impl.EventDaoUtils;
import org.zenoss.zep.events.ZepConfigUpdatedEvent;
import org.zenoss.zep.events.ZepEvent;
import org.zenoss.zep.plugins.EventPreCreateContext;
import org.zenoss.zep.plugins.EventPreCreatePlugin;
import java.util.Map;
import static java.util.Arrays.asList;
/**
* Summary:
* This plugin runs before each event is processed and detects if the
* device emitting the event is currently in a state of flapping.
* <p/>
* Glossary:
* 1. Detection - a set of events, as identified by the clear id, is
* opened and cleared several times in a given time interval.
* 2. Notification suppression - only alert once when an event flaps several times,
* this is outside the scope of this project
* 3. Flap - when a given event set as identified by the clearid goes from a
* less than error severity to an error or greater severity.
* 4. Clear Id - This is the clear fingerprint hash, it is how I tie a set of events
* together.
* <p/>
* Implementation:
* Each time an event is indexed we will add it
* to a FlapTracker that has the following properties:
* <p/>
* key: event clear id
* value: Object - last status for the clear id
* - list of timestamps for state transitions (or flaps)
* <p/>
* By the list of timestamps for state transitions we mean that every
* time we get an event with a matching clearid and a severity greater
* than or equal to SeverityThreshold and the previous severity was less than
* SeverityThreshold the timestamp will be added to that entry. In other words,
* depending on the severity of the event the event set will either be in a
* "good state" (e.g. below warning) or a "bad state" (e.g. warning or above).
* We need to keep track of each time the event set goes from "good" to "bad".
* <p/>
* If the length of the list of timestamps reaches a configurable length in
* a given interval we can define the events for that clear id to be
* flapping. Any timestamps that fall outside of the given interval will
* be discarded.
* When we detect that a given clear id is flapping, we will send out a
* separate event to denote that that device and component is in a state
* of flapping. There will not be a separate "unflapping" event. When a
* flapping event is sent we will clear the list of transitions. This is
* so we do not repeatedly send flapping events.
* <p/>
* Example:
* To illustrate how this works here is an example series of events
* <p/>
* <p/>
* This is assuming we need 3 "good" to "bad" state transitions to qualify as flapping. Our time interval
* is 8 seconds.
* <p/>
* ClearId: "cl1", severity: 0, timestamp: 1
* ClearId: "cl1", severity: 0, timestamp: 2
* ClearId: "cl1", severity: 4, timestamp: 3 # flap 1
* ClearId: "cl1", severity: 4, timestamp: 4
* ClearId: "cl1", severity: 4, timestamp: 5
* ClearId: "cl1", severity: 0, timestamp: 6
* ClearId: "cl1", severity: 5, timestamp: 7 # flap 2
* ClearId: "cl1", severity: 5, timestamp: 8
* ClearId: "cl1", severity: 0, timestamp: 9
* ClearId: "cl1", severity: 4, timestamp: 10 # flap 3 (alert sent)
* ClearId: "cl1", severity: 0, timestamp: 11
* ClearId: "cl1", severity: 5, timestamp: 12 # flap 4
* <p/>
* The hashmap where we keep track of the flapping would look like this after each event:
* <p/>
* Event => ClearId: "cl1", severity: 0, timestamp: 1
* HashMap => {"cl1": {lastseverity:0,
* transitions: []}}
* <p/>
* Event => ClearId: "cl1", severity: 0, timestamp: 2
* HashMap => {"cl1": {lastseverity:0,
* transitions: []}}
* <p/>
* Event => ClearId: "cl1", severity: 4, timestamp: 3 # flap 1
* HashMap => {"cl1": {lastseverity:4,
* transitions: [3]}}
* <p/>
* Event => ClearId: "cl1", severity: 4, timestamp: 4
* HashMap => {"cl1": {lastseverity:4,
* transitions: [3]}}
* <p/>
* Event => ClearId: "cl1", severity: 4, timestamp: 5
* HashMap => {"cl1": {lastseverity:4,
* transitions: [3]}}
* <p/>
* Event => ClearId: "cl1", severity: 0, timestamp: 6
* HashMap => {"cl1": {lastseverity:0,
* transitions: [3]}}
* <p/>
* Event => ClearId: "cl1", severity: 5, timestamp: 7 # flap 2
* HashMap => {"cl1": {lastseverity:5,
* transitions: [3, 7]}}
* <p/>
* Event => ClearId: "cl1", severity: 5, timestamp: 8
* HashMap => {"cl1": {lastseverity:5,
* transitions: [3, 7]}}
* <p/>
* Event => ClearId: "cl1", severity: 0, timestamp: 9
* HashMap => {"cl1": {lastseverity:0,
* transitions: [3, 7]}}
* <p/>
* Event => ClearId: "cl1", severity: 4, timestamp: 10 # flap 3 (alert sent)
* HashMap => {"cl1": {lastseverity:4,
* transitions: [3, 7, 10]}}
* <p/>
* Event => ClearId: "cl1", severity: 0, timestamp: 11
* HashMap => {"cl1": {lastseverity:0,
* transitions: []}}
* <p/>
* Event => ClearId: "cl1", severity: 5, timestamp: 12
* HashMap => {"cl1": {lastseverity:5,
* transitions: [12]}}
*/
public class EventFlappingPlugin extends EventPreCreatePlugin implements ApplicationListener<ZepEvent> {

    private static final Logger logger = LoggerFactory.getLogger(EventFlappingPlugin.class);

    // spring stuff
    @Autowired
    private FlapTrackerDao flapTrackerDao;

    private Zep.ZepConfig config;

    @Autowired
    private EventPublisher publisher;

    @Autowired
    private UUIDGenerator uuidGenerator;

    // these are overwritten in the config, see the loadConfig method
    private boolean enabled = true;
    private String eventFlappingClass = "/Status/Flapping";

    private ConfigDao configDao;

    public void setFlapTrackerDao(FlapTrackerDao s) {
        this.flapTrackerDao = s;
    }

    public void setPublisher(EventPublisher pub) {
        this.publisher = pub;
    }

    /**
     * Sets the DAO the plugin configuration is read from when the plugin starts.
     *
     * @param config DAO used by {@link #start(Map)} to fetch the ZEP configuration
     */
    public void setConfig(ConfigDao config) {
        this.configDao = config;
    }

    public void setUuidGenerator(UUIDGenerator uuidGenerator) {
        this.uuidGenerator = uuidGenerator;
    }

    /**
     * Starts the plugin, then reads the ZEP configuration and applies the
     * flapping related settings from it.
     *
     * @param properties plugin properties, passed through to the superclass
     */
    @Override
    public void start(Map<String, String> properties) {
        super.start(properties);
        readConfig();
        loadConfig();
    }

    /**
     * Fetches the current configuration from the config DAO. Any failure is
     * logged and the default configuration instance is used instead, so that
     * starting the plugin never fails because of the configuration lookup.
     */
    private void readConfig() {
        try {
            this.config = configDao.getConfig();
        } catch (Exception e) {
            this.config = Zep.ZepConfig.getDefaultInstance();
            logger.warn("Unable to load event flapping configuration, using defaults", e);
        }
    }

    /**
     * Copies the flapping settings (enabled flag and flapping event class)
     * out of the currently held configuration.
     */
    private void loadConfig() {
        logger.info("Event flapping detection plugin loading configuration");
        // update our variables from the config
        enabled = config.getEnableEventFlappingDetection();
        eventFlappingClass = config.getFlappingEventClass();
    }

    /**
     * Picks up configuration changes: when a {@link ZepConfigUpdatedEvent} is
     * received its configuration replaces the current one and the flapping
     * settings are reloaded. All other events are ignored.
     *
     * @param event application event that was published
     */
    @Override
    public void onApplicationEvent(ZepEvent event) {
        if (event instanceof ZepConfigUpdatedEvent) {
            ZepConfigUpdatedEvent configUpdatedEvent = (ZepConfigUpdatedEvent) event;
            this.config = configUpdatedEvent.getConfig();
            loadConfig();
        }
    }

    /**
     * This method counts the previous flaps stored in the tracker.
     *
     * @param tracker tracker holding the previous severity and the flap timestamps
     * @param event   event being processed; supplies the severity and the flapping interval
     * @return count of flap instances in the configured time window, or 0 when the
     *         severity of the event equals the previous severity held by the tracker
     */
    protected int countFlapsForEvent(FlapTracker tracker, Event event) {
        EventSeverity sev = event.getSeverity();
        EventSeverity previousSeverity = tracker.getPreviousSeverity();
        // if the severity hasn't changed it can't be flapping
        if (sev.equals(previousSeverity)) {
            return 0;
        }
        // the window is expressed in seconds, like the flapping interval of the event
        final long windowStart = System.currentTimeMillis() / 1000L - event.getFlappingIntervalSeconds();
        // count all the timestamps that fall within the window
        int count = 0;
        for (long timestamp : tracker.getTimestamps()) {
            if (timestamp >= windowStart) {
                count++;
            }
        }
        return count;
    }

    /**
     * Determines if the given event is a flap by looking at the severity of the previous
     * event.
     *
     * @param event        event being processed
     * @param tracker      tracker holding the previous severity for the event's clear id
     * @param sevThreshold severities at or above this value count as the "bad" state
     * @return true if the event's actor has an element uuid, the previous severity was
     *         below the threshold and the severity of this event is at or above it
     */
    protected boolean isEventFlap(Event event, FlapTracker tracker, EventSeverity sevThreshold) {
        // make sure we are identified before saying this is a flap
        String uuid = event.getActor().getElementUuid();
        if (uuid == null || uuid.isEmpty()) {
            return false;
        }
        final int threshold = sevThreshold.getNumber();
        final int previous = tracker.getPreviousSeverity().getNumber();
        final int current = event.getSeverity().getNumber();
        // a flap is a transition from below the threshold to at or above it
        return previous < threshold && current >= threshold;
    }

    /**
     * This builds the event flapping event that is emitted when a device
     * is determined to be flapping
     *
     * @param event     event that triggered the flapping event
     * @param flapCount number of flaps counted, stored in the "flapcount" detail
     * @return Event The Flapping Event
     */
    protected Event buildFlappingEvent(Event event, int flapCount) {
        // create an event based off of the passed in event
        // but with the event class from the config
        final Event.Builder flapEvent = Event.newBuilder();
        flapEvent.setUuid(this.uuidGenerator.generate().toString());
        flapEvent.setCreatedTime(System.currentTimeMillis());
        Zep.EventActor actor = event.getActor();
        flapEvent.setActor(actor);

        StringBuilder summary = new StringBuilder()
                .append("Event flapping detected for ")
                .append(actor.getElementIdentifier());
        if (actor.hasElementSubIdentifier()) {
            summary.append(":");
            summary.append(actor.getElementSubIdentifier());
        }
        flapEvent.setSummary(summary.toString());
        flapEvent.setSeverity(EventSeverity.SEVERITY_WARNING);
        flapEvent.setEventClass(eventFlappingClass);

        // add information from the source event
        flapEvent.addAllDetails(
                asList(
                        Zep.EventDetail.newBuilder().setName("flapcount")
                                .addValue(Integer.toString(flapCount)).build(),
                        Zep.EventDetail.newBuilder().setName("cause_uuid")
                                .addValue(event.getUuid()).build(),
                        Zep.EventDetail.newBuilder().setName("cause_severity")
                                .addValue(event.getSeverity().name()).build(),
                        Zep.EventDetail.newBuilder().setName("cause_summary")
                                .addValue(event.getSummary()).build(),
                        Zep.EventDetail.newBuilder().setName("cause_event_class")
                                .addValue(event.getEventClass()).build(),
                        Zep.EventDetail.newBuilder().setName("cause_event_key")
                                .addValue(event.getEventKey()).build()
                )
        );
        // the details of the source event are carried over as well
        flapEvent.addAllDetails(event.getDetailsList());
        return flapEvent.build();
    }

    /**
     * Runs flapping detection for the event when the plugin is enabled and a
     * clear fingerprint can be generated for it. The event itself is never
     * modified.
     *
     * @param event   event about to be created
     * @param context pre-create context (not used by this plugin)
     * @return the event that was passed in
     * @throws ZepException declared by the plugin contract
     */
    @Override
    public Event processEvent(Event event, EventPreCreateContext context) throws ZepException {
        if (enabled) {
            final long startTime = System.currentTimeMillis();
            String fingerprintHash = EventDaoUtils.DEFAULT_GENERATOR.generateClearFingerprint(event);
            if (fingerprintHash != null) {
                detectEventFlapping(event, fingerprintHash);
            }
            final long endTime = System.currentTimeMillis();
            logger.debug("Detected flapping in {} milliseconds", endTime - startTime);
        }
        return event;
    }

    /**
     * First determines if this event is a flap and if it is we determine if we have
     * flapped enough to warrant sending a flap event. This method will update the
     * timestamp in the tracker as well as cull the previous timestamps and publish the
     * flapping event.
     * <p/>
     * Failures to load or persist the tracker and failures to publish the flapping
     * event are logged and otherwise ignored.
     *
     * @param event           Event we are detecting flapping on
     * @param fingerprintHash clear finger print hash for this event
     */
    protected void detectEventFlapping(Event event, String fingerprintHash) {
        EventSeverity sev = event.getSeverity();
        EventSeverity severityThreshold = event.getFlappingSeverity();
        final int flapThreshold = event.getFlappingThreshold();
        final int flapWindowSeconds = event.getFlappingIntervalSeconds();

        FlapTracker tracker;
        try {
            tracker = flapTrackerDao.getFlapTrackerByClearFingerprintHash(fingerprintHash);
        } catch (ZepException e) {
            logger.warn("Unable to detect event flapping", e);
            return;
        }

        if (isEventFlap(event, tracker, severityThreshold)) {
            // append to our list of flaps
            tracker.addCurrentTimeStamp();
            // see if we have gone above the threshold
            int flapCount = countFlapsForEvent(tracker, event);
            if (shouldPublishFlapEvent(flapCount, flapThreshold)) {
                publishFlapEvent(event, fingerprintHash, flapCount);
            }
        }

        // make sure we don't persist timestamps that occurred before our window
        long windowStart = System.currentTimeMillis() / 1000L - flapWindowSeconds;
        tracker.discardTimestampsOlderThan(windowStart);
        tracker.setPreviousSeverity(sev);
        try {
            flapTrackerDao.persistTracker(fingerprintHash, tracker, flapWindowSeconds);
        } catch (ZepException e) {
            logger.warn("Unable to persist event flapping tracker", e);
        }
    }

    /**
     * Decides whether a flapping event is due: the flap count must have reached
     * the threshold and be an exact multiple of it, so that not every flap past
     * the threshold publishes another event.
     *
     * @param flapCount     number of flaps counted inside the window
     * @param flapThreshold number of flaps required before a flapping event is sent
     * @return true if a flapping event should be published
     */
    private static boolean shouldPublishFlapEvent(int flapCount, int flapThreshold) {
        // a non-positive threshold cannot be evaluated: the remainder below would
        // throw an ArithmeticException for 0, so no flapping event is sent
        if (flapThreshold <= 0) {
            return false;
        }
        return flapCount >= flapThreshold && flapCount % flapThreshold == 0;
    }

    /**
     * Builds the flapping event for the given source event and publishes it.
     * A publishing failure is logged and not propagated.
     */
    private void publishFlapEvent(Event event, String fingerprintHash, int flapCount) {
        logger.info("Publishing flap event for clear {}", fingerprintHash);
        Event flapEvent = buildFlappingEvent(event, flapCount);
        try {
            logger.debug("Publishing this event {}", flapEvent);
            publisher.publishEvent(flapEvent);
        } catch (ZepException e) {
            logger.error("Unable to publish flap event ", e);
        }
    }
}