/* * RHQ Management Platform * Copyright (C) 2005-2008 Red Hat, Inc. * All rights reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation version 2 of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ package org.rhq.enterprise.server.alert.engine.internal; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.concurrent.locks.ReentrantReadWriteLock; import javax.persistence.EntityNotFoundException; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.rhq.core.domain.configuration.ResourceConfigurationUpdate; import org.rhq.core.domain.event.Event; import org.rhq.core.domain.event.EventSource; import org.rhq.core.domain.measurement.Availability; import org.rhq.core.domain.measurement.MeasurementData; import org.rhq.core.domain.measurement.calltime.CallTimeData; import org.rhq.core.domain.operation.OperationHistory; import org.rhq.enterprise.server.alert.engine.AlertConditionCacheStats; import org.rhq.enterprise.server.alert.engine.model.AbstractCacheElement; import org.rhq.enterprise.server.alert.engine.model.AvailabilityDurationComposite; import org.rhq.enterprise.server.cloud.StatusManagerLocal; import org.rhq.enterprise.server.core.AgentManagerLocal; import org.rhq.enterprise.server.plugin.pc.drift.DriftChangeSetSummary; import org.rhq.enterprise.server.util.LookupUtil; /** * This singleton that contains multiple {@link AgentConditionCache}s and one {@link GlobalConditionCache}. * Each {@link AgentConditionCache} will maintain {@link AbstractCacheElement}s for data that can ONLY be * reported by an agent, and thus can be perfectly segmented on an agent-by-agent basis. On the other hand, * the {@link GlobalConditionCache} will maintain {@link AbstractCacheElement}s for data that can either be * agent-side or server-side initiated. * * This manager forms a centralized interface through which to interact with the children caches. * * @author Joseph Marques */ public final class AlertConditionCacheCoordinator { private static final Log log = LogFactory.getLog(AlertConditionCacheCoordinator.class); private static final AlertConditionCacheCoordinator instance = new AlertConditionCacheCoordinator(); /** * When processing EventReport, we may hit an event that triggers problem alert and event that triggers recovery alert within the same report. Firing alert * is asynchronous task, so if processing events too fast, problem alert is fired after we process recovery event, so we miss recovery alert at all. * To workaround that, we slow down event processing in case we hit any event that triggers alert. When such event exists, we sleep given amount of milis * to wait for alert to get fired. There is no performance impact on processing events without alert fired, and this delay is set to 500 by default. * Recommended value is from 500 to 1500milis, 500 was tested as reliable enough for low server load cases. */ private static final long ALERTED_EVENT_PROCESSING_DELAY; static { long alertedEventProcessingDelay = 500L; try { alertedEventProcessingDelay = Long.parseLong(System.getProperty("rhq.server.alerted.event.process.delay", "500")); } catch (Throwable t) { // } ALERTED_EVENT_PROCESSING_DELAY = alertedEventProcessingDelay; } public enum Cache { MeasurementDataCache(Type.Agent), // MeasurementTraitCache(Type.Agent), // CallTimeDataCache(Type.Agent), // ResourceOperationCache(Type.Global), // AvailabilityCache(Type.Global), // EventsCache(Type.Agent), // ResourceConfigurationCache(Type.Global), // DriftCache(Type.Agent), // AvailabilityDurationCache(Type.Global); // public enum Type { Global, // Agent; }; public Type type; Cache(Type type) { this.type = type; } } private volatile GlobalConditionCache globalCache; private Map<Integer, AgentConditionCache> agentCaches; private ReentrantReadWriteLock agentReadWriteLock; private AgentManagerLocal agentManager; private StatusManagerLocal statusManager; private AlertConditionCacheCoordinator() { agentManager = LookupUtil.getAgentManager(); statusManager = LookupUtil.getStatusManager(); globalCache = new GlobalConditionCache(); // create the collections ahead of time agentCaches = new HashMap<Integer, AgentConditionCache>(); agentReadWriteLock = new ReentrantReadWriteLock(); } public static AlertConditionCacheCoordinator getInstance() { return instance; } public void reloadGlobalCache() { try { // simply "forget" about the old cache, let the JVM release the memory in time log.debug("Start reloading global cache"); globalCache = new GlobalConditionCache(); log.debug("Finished reloading global cache"); } catch (Throwable t) { try { Throwable throwable = t; boolean found = false; while (throwable != null) { if (throwable instanceof EntityNotFoundException) { // we're trying to load a list of conditions at the very moment one is deleted out from under us statusManager.markGlobalCache(); log.debug("EntityNotFoundException thrown during reload, resetting status bit for retry"); found = true; break; } throwable = throwable.getCause(); } if (!found) { log.error("Error reloading global cache", t); } } catch (Throwable inner) { // again, don't let any exceptions bubble up to the calling SLSB layer log.error("Error while resetting agent status bit during failed global cache reload attempt", inner); } } } public void reloadCachesForAgent(int agentId) { AgentConditionCache agentCache = null; try { if (log.isDebugEnabled()) { log.debug("Start reloading cache for agent[id=" + agentId + "]"); } agentCache = new AgentConditionCache(agentId); if (log.isDebugEnabled()) { log.debug("Finished reloading cache for agent[id=" + agentId + "]"); } } catch (Throwable t) { try { Throwable throwable = t; boolean found = false; while (throwable != null) { if (throwable instanceof EntityNotFoundException) { // we're trying to load a list of conditions at the very moment one is deleted out from under us statusManager.updateByAgent(agentId); log.debug("EntityNotFoundException thrown during reload, resetting status bit for retry"); found = true; break; } throwable = throwable.getCause(); } if (!found) { log.error("Error reloading cache for agent[id=" + agentId + "]", t); } } catch (Throwable inner) { // again, don't let any exceptions bubble up to the calling SLSB layer log.error("Error while resetting agent status bit during failed cache reload attempt for agent[id=" + agentId + "]", inner); } } if (agentCache != null) { agentReadWriteLock.writeLock().lock(); try { // simply "forget" about the old cache, let the JVM release the memory in time agentCaches.put(agentId, agentCache); log.debug("Reloaded agent[id=" + agentId + "] cache"); } catch (Throwable t) { log.error("Error reloading cache for agent[id=" + agentId + "]", t); // don't let any exceptions bubble up to the calling SLSB layer } finally { agentReadWriteLock.writeLock().unlock(); } } } public AlertConditionCacheStats checkConditions(MeasurementData... measurementData) { if (measurementData == null || measurementData.length == 0) { return new AlertConditionCacheStats(); } MeasurementData datum = measurementData[0]; Integer agentId = getAgentId(datum); if (agentId == null) { log.error("Could not find agent for scheduleId = " + datum.getScheduleId()); return new AlertConditionCacheStats(); } AlertConditionCacheStats stats = null; AgentConditionCache agentCache = null; agentReadWriteLock.readLock().lock(); try { agentCache = agentCaches.get(agentId); } catch (Throwable t) { log.error("Error during checkConditions", t); // don't let any exceptions bubble up to the calling SLSB layer } finally { agentReadWriteLock.readLock().unlock(); } if (agentCache != null) { stats = agentCache.checkConditions(measurementData); } else { stats = new AlertConditionCacheStats(); } return stats; } public AlertConditionCacheStats checkConditions(CallTimeData... callTimeData) { if (callTimeData == null || callTimeData.length == 0) { return new AlertConditionCacheStats(); } CallTimeData datum = callTimeData[0]; Integer agentId = getAgentId(datum); if (agentId == null) { log.error("Could not find agent for scheduleId = " + datum.getScheduleId()); return new AlertConditionCacheStats(); } AlertConditionCacheStats stats = null; AgentConditionCache agentCache = null; agentReadWriteLock.readLock().lock(); try { agentCache = agentCaches.get(agentId); } catch (Throwable t) { log.error("Error during checkConditions", t); // don't let any exceptions bubble up to the calling SLSB layer } finally { agentReadWriteLock.readLock().unlock(); } if (agentCache != null) { stats = agentCache.checkConditions(callTimeData); } else { stats = new AlertConditionCacheStats(); } return stats; } public AlertConditionCacheStats checkConditions(OperationHistory operationHistory) { AlertConditionCacheStats stats = null; try { stats = globalCache.checkConditions(operationHistory); } catch (Throwable t) { log.error("Error during checkConditions", t); // don't let any exceptions bubble up to the calling SLSB layer } if (stats == null) { stats = new AlertConditionCacheStats(); } return stats; } public AlertConditionCacheStats checkConditions(ResourceConfigurationUpdate update) { AlertConditionCacheStats stats = null; try { stats = globalCache.checkConditions(update); } catch (Throwable t) { log.error("Error during checkConditions", t); // don't let any exceptions bubble up to the calling SLSB layer } if (stats == null) { stats = new AlertConditionCacheStats(); } return stats; } public AlertConditionCacheStats checkConditions(EventSource source, Event... events) { if (source == null) { return new AlertConditionCacheStats(); } Integer agentId = getAgentId(source); if (agentId == null) { log.error("Could not find agent for resourceId = " + source.getResource().getId()); return new AlertConditionCacheStats(); } AlertConditionCacheStats stats = new AlertConditionCacheStats(); List<Event> unprocessedEvents = new ArrayList(Arrays.asList(events)); // need a List that supports iterator remove while (!unprocessedEvents.isEmpty()) { AgentConditionCache agentCache = null; agentReadWriteLock.readLock().lock(); try { agentCache = agentCaches.get(agentId); } catch (Throwable t) { log.error("Error during checkConditions", t); // don't let any exceptions bubble up to the calling SLSB layer } finally { agentReadWriteLock.readLock().unlock(); } if (agentCache != null) { stats.add(agentCache.checkConditions(source, unprocessedEvents)); if (!unprocessedEvents.isEmpty()) { // delay for a brief time to allow for the matched conditions to potentially fire an alert and // activate recovery alerts, in case the remaining events match the pending recovery conditions try { Thread.sleep(ALERTED_EVENT_PROCESSING_DELAY); } catch (InterruptedException e) { // just continue as a best effort } } } else { break; } } return stats; } public AlertConditionCacheStats checkConditions(DriftChangeSetSummary driftChangeSetSummary) { if (driftChangeSetSummary == null) { return new AlertConditionCacheStats(); } Integer agentId = getAgentId(driftChangeSetSummary); if (agentId == null) { log.error("Could not find agent for resourceId = " + driftChangeSetSummary.getResourceId()); return new AlertConditionCacheStats(); } AlertConditionCacheStats stats = null; AgentConditionCache agentCache = null; agentReadWriteLock.readLock().lock(); try { agentCache = agentCaches.get(agentId); } catch (Throwable t) { log.error("Error during checkConditions", t); // don't let any exceptions bubble up to the calling SLSB layer } finally { agentReadWriteLock.readLock().unlock(); } if (agentCache != null) { stats = agentCache.checkConditions(driftChangeSetSummary); } else { stats = new AlertConditionCacheStats(); } return stats; } public AlertConditionCacheStats checkConditions(Availability... availabilities) { AlertConditionCacheStats stats = null; try { stats = globalCache.checkConditions(availabilities); } catch (Throwable t) { log.error("Error during checkConditions", t); // don't let any exceptions bubble up to the calling SLSB layer } if (stats == null) { stats = new AlertConditionCacheStats(); } return stats; } public AlertConditionCacheStats checkConditions(AvailabilityDurationComposite... composites) { AlertConditionCacheStats stats = null; try { stats = globalCache.checkConditions(composites); } catch (Throwable t) { log.error("Error during checkConditions", t); // don't let any exceptions bubble up to the calling SLSB layer } if (stats == null) { stats = new AlertConditionCacheStats(); } return stats; } private Integer getAgentId(DriftChangeSetSummary driftChangeSetSummary) { try { int resourceId = driftChangeSetSummary.getResourceId(); Integer agentId = agentManager.getAgentIdByResourceId(resourceId); return agentId; } catch (Throwable t) { log.error("Error looking up agent by DriftChangeSet", t); } return null; } private Integer getAgentId(EventSource source) { try { int resourceId = source.getResource().getId(); Integer agentId = agentManager.getAgentIdByResourceId(resourceId); return agentId; } catch (Throwable t) { log.error("Error looking up agent by EventSource", t); } return null; } private Integer getAgentId(MeasurementData datum) { try { int scheduleId = datum.getScheduleId(); Integer agentId = agentManager.getAgentIdByScheduleId(scheduleId); return agentId; } catch (Throwable t) { log.error("Error looking up agent by MeasurementData", t); } return null; } private Integer getAgentId(CallTimeData datum) { try { int scheduleId = datum.getScheduleId(); Integer agentId = agentManager.getAgentIdByScheduleId(scheduleId); return agentId; } catch (Throwable t) { log.error("Error looking up agent by CallTimeData", t); } return null; } public int getCacheSize(AlertConditionCacheCoordinator.Cache cache) { int result = 0; if (cache.type == Cache.Type.Global) { result += globalCache.getCacheSize(cache); } else if (cache.type == Cache.Type.Agent) { List<AgentConditionCache> cachesCopy = null; agentReadWriteLock.readLock().lock(); try { cachesCopy = new ArrayList<AgentConditionCache>(agentCaches.values()); } catch (Throwable t) { log.error("Error during getCacheSize", t); // don't let any exceptions bubble up to the calling SLSB layer } finally { agentReadWriteLock.readLock().unlock(); } for (AgentConditionCache agentCache : cachesCopy) { result += agentCache.getCacheSize(cache); } } else { log.error("The " + AlertConditionCacheCoordinator.class.getSimpleName() + " does not support getting the size for caches of type " + cache.type); } return result; } public Map<String, Integer> getCacheCounts() { Map<String, Integer> counts = new HashMap<String, Integer>(); for (Cache cache : Cache.values()) { counts.put(cache.name(), getCacheSize(cache)); } return counts; } }