/** * NOTE: This copyright does *not* cover user programs that use Hyperic * program services by normal system calls through the application * program interfaces provided as part of the Hyperic Plug-in Development * Kit or the Hyperic Client Development Kit - this is merely considered * normal use of the program, and does *not* fall under the heading of * "derived work". * * Copyright (C) [2010], VMware, Inc. * This file is part of Hyperic. * * Hyperic is free software; you can redistribute it and/or modify * it under the terms version 2 of the GNU General Public License as * published by the Free Software Foundation. This program is distributed * in the hope that it will be useful, but WITHOUT ANY WARRANTY; without * even the implied warranty of MERCHANTABILITY or FITNESS FOR A * PARTICULAR PURPOSE. See the GNU General Public License for more * details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 * USA. * */ package org.hyperic.hq.measurement.server.session; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import javax.annotation.PostConstruct; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.hyperic.hq.appdef.Agent; import org.hyperic.hq.appdef.server.session.AgentDAO; import org.hyperic.hq.appdef.server.session.AgentPluginSyncRestartThrottle; import org.hyperic.hq.appdef.server.session.Platform; import org.hyperic.hq.appdef.shared.AgentManager; import org.hyperic.hq.appdef.shared.AgentNotFoundException; import org.hyperic.hq.appdef.shared.AppdefUtil; import org.hyperic.hq.authz.server.session.Resource; import org.hyperic.hq.authz.shared.PermissionManager; import org.hyperic.hq.bizapp.server.session.LatherDispatcher; import org.hyperic.hq.context.Bootstrap; import org.hyperic.hq.measurement.MeasurementConstants; import org.hyperic.hq.measurement.TimingVoodoo; import org.hyperic.hq.measurement.shared.AvailabilityManager; import org.hyperic.hq.product.MetricValue; import org.hyperic.hq.stats.ConcurrentStatsCollector; import org.hyperic.util.TimeUtil; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Service; import org.springframework.transaction.annotation.Transactional; /** * Default implementation of {@link BackfillPointsService} Code was extracted * unmodified from AvailabilityCheckServiceImpl to allow for that class to add * data points in separate transactions * @author jhickey * */ @Transactional(readOnly = true) @Service public class BackfillPointsServiceImpl implements BackfillPointsService { private static final double AVAIL_DOWN = MeasurementConstants.AVAIL_DOWN; private static final double AVAIL_PAUSED = MeasurementConstants.AVAIL_PAUSED; private static final double AVAIL_NULL = MeasurementConstants.AVAIL_NULL; private static final long MINUTE = MeasurementConstants.MINUTE; private static final String AVAIL_BACKFILLER_NUMPLATFORMS = ConcurrentStatsCollector.AVAIL_BACKFILLER_NUMPLATFORMS; private final Log log = LogFactory.getLog(BackfillPointsServiceImpl.class); private final AvailabilityManager availabilityManager; private final PermissionManager permissionManager; private final AvailabilityCache availabilityCache; private final AgentPluginSyncRestartThrottle agentPluginSyncRestartThrottle; private final AgentDAO agentDAO; private final ConcurrentStatsCollector concurrentStatsCollector; private final AgentManager agentManager; @Autowired public BackfillPointsServiceImpl(AvailabilityManager availabilityManager, PermissionManager permissionManager, AgentPluginSyncRestartThrottle agentPluginSyncRestartThrottle, AgentDAO agentDAO, AvailabilityCache availabilityCache, ConcurrentStatsCollector concurrentStatsCollector, AgentManager agentManager) { this.availabilityManager = availabilityManager; this.permissionManager = permissionManager; this.availabilityCache = availabilityCache; this.agentPluginSyncRestartThrottle = agentPluginSyncRestartThrottle; this.agentDAO = agentDAO; this.concurrentStatsCollector = concurrentStatsCollector; this.agentManager = agentManager; } @PostConstruct public void initStats() { concurrentStatsCollector.register(AVAIL_BACKFILLER_NUMPLATFORMS); } public Map<Integer, ResourceDataPoint> getBackfillPlatformPoints(long current) { Map<Integer, ResourceDataPoint> downPlatforms = getDownPlatforms(current); log.debug("getBackfillPlatformPoints: found " + downPlatforms.size() + " downPlatforms for resource IDs: " + downPlatforms.keySet()); removeRestartingAgents(downPlatforms); log.debug("getBackfillPlatformPoints: after removeRestartingAgents: " + downPlatforms.size() + " downPlatforms for resource IDs: " + downPlatforms.keySet()); if (downPlatforms != null) { concurrentStatsCollector.addStat(downPlatforms.size(), AVAIL_BACKFILLER_NUMPLATFORMS); } return downPlatforms; } private void removeRestartingAgents(Map<Integer, ResourceDataPoint> backfillData) { if (backfillData.isEmpty()) { return; } final long now = now(); final Map<Integer, Long> restarting = agentPluginSyncRestartThrottle.getAgentIdsInRestartState(); final Set<Integer> processed = new HashSet<Integer>(); for (final Entry<Integer, Long> entry : restarting.entrySet()) { final Integer agentId = entry.getKey(); final long restartTime = entry.getValue(); processed.add(agentId); removeAssociatedPlatforms(agentId, backfillData, restartTime, true); if (backfillData.isEmpty()) { return; } } // [HHQ-4937] allow agents up to 10 minutes after they checkin to start sending availability // before marking them down final Map<Integer, Long> lastCheckins = agentPluginSyncRestartThrottle.getLastCheckinInfo(); for (final Entry<Integer, Long> entry : lastCheckins.entrySet()) { final Integer agentId = entry.getKey(); final long lastCheckin = entry.getValue(); if (((lastCheckin + (10*MINUTE)) < now) || processed.contains(agentId)) { continue; } removeAssociatedPlatforms(agentId, backfillData, lastCheckin, false); if (backfillData.isEmpty()) { return; } } } private void removeAssociatedPlatforms(int agentId, Map<Integer, ResourceDataPoint> backfillData, long timems, boolean restarting) { final boolean debug = log.isDebugEnabled(); final Agent agent = agentDAO.get(agentId); if (agent == null) { return; } final Collection<Platform> platforms = agent.getPlatforms(); for (final Platform platform : platforms) { if (debug) { if (restarting) { log.debug(new StringBuilder(64) .append("removing platformId=").append(platform.getId()) .append(" since its agentId=").append(agentId) .append(" just restarted at ").append(TimeUtil.toString(timems)) .toString()); } else { log.debug(new StringBuilder(64) .append("removing platformId=").append(platform.getId()) .append(" since its agentId=").append(agentId) .append(" is in restart state since ").append(TimeUtil.toString(timems)) .toString()); } } backfillData.remove(platform.getResource().getId()); } } private long now() { return System.currentTimeMillis(); } private Map<Integer, ResourceDataPoint> getDownPlatforms(long timeInMillis) { final boolean debug = log.isDebugEnabled(); final List<Measurement> platformResources = availabilityManager.getPlatformResources(); final long now = TimingVoodoo.roundDownTime(timeInMillis, MINUTE); final String nowTimestamp = TimeUtil.toString(now); final Map<Integer, ResourceDataPoint> rtn = new HashMap<Integer, ResourceDataPoint>(platformResources.size()); final LatherDispatcher latherDispatcher = Bootstrap.getBean(LatherDispatcher.class); synchronized (availabilityCache) { for (final Measurement meas : platformResources) { final long interval = meas.getInterval(); /** * minDowntime says that the agent must be down for a minimum of 3 minutes for a platform to be marked * down, not 2x availability interval for an availability interval of one minute (as it was in the * past). The reason is that if the SenderThread fails once to send due to a connection issue * (if the server is too busy), then it will only send out again one minute later. For an availability * interval of 1 minute, the platform will be marked down since the agent will only send again after * one full minute + latency. Setting to a minimum of 3 minutes allows the agent time to have one * more attempt after a failure */ final long minDowntime = Math.max(2*interval, 3*MINUTE); final long end = getEndWindow(now, meas); final long begin = getBeginWindow(end, meas); final DataPoint defaultPt = new DataPoint(meas.getId().intValue(), AVAIL_NULL, end); final DataPoint last = availabilityCache.get(meas.getId(), defaultPt); final long lastTimestamp = last.getTimestamp(); if (debug) { String msg = "Checking availability for " + last + ", CacheValue=(" + TimeUtil.toString(lastTimestamp) + ") vs. Now=(" + nowTimestamp + ")"; log.debug(msg); } if (begin > end) { // this represents the scenario where the measurement mtime // was modified recently and therefore we need to wait // another interval log.info("skipping measurement " + meas.getId() + ": begin=" + begin + " > end=" + end); continue; } if (!meas.isEnabled()) { final long t = TimingVoodoo.roundDownTime(now - interval, interval); final DataPoint point = new DataPoint(meas.getId(), new MetricValue(AVAIL_PAUSED, t)); Resource resource = meas.getResource(); log.info("adding resourceId=" + resource.getId() + " to list of down platforms, metric is not enabled"); rtn.put(resource.getId(), new ResourceDataPoint(resource, point)); } else if ((last.getValue() == AVAIL_DOWN) || ((now - lastTimestamp) > minDowntime)) { // HQ-1664: This is a hack: Give a 5 minute grace period for the agent and HQ // to sync up if a resource was recently part of a downtime window if ((last.getValue() == AVAIL_PAUSED) && ((now - lastTimestamp) <= (5 * 60 * 1000))) { continue; } long t = (last.getValue() != AVAIL_DOWN) ? lastTimestamp + interval : TimingVoodoo.roundDownTime(now - interval, interval); t = (last.getValue() == AVAIL_PAUSED) ? TimingVoodoo.roundDownTime(now, interval) : t; DataPoint point = new DataPoint(meas.getId(), new MetricValue(AVAIL_DOWN, t)); Resource resource = meas.getResource(); final long lastFromLather = getLastLatherConnectTime(resource, latherDispatcher); if ((lastFromLather == Long.MIN_VALUE) || ((now - lastFromLather) > minDowntime)) { rtn.put(resource.getId(), new ResourceDataPoint(resource, point)); final String msg = new StringBuilder(256) .append("Marking availability DOWN for ").append(last) .append(", CacheValue=(").append(TimeUtil.toString(lastTimestamp)) .append(", datapt=").append(last.getValue()) .append(") vs. Now=(").append(nowTimestamp).append(") vs. Lather=(") .append(TimeUtil.toString(lastFromLather)).append(")") .toString(); log.info(msg); } rtn.put(resource.getId(), new ResourceDataPoint(resource, point)); } } } if (!rtn.isEmpty()) { permissionManager.getHierarchicalAlertingManager().performSecondaryAvailabilityCheck(rtn); } return rtn; } private long getLastLatherConnectTime(Resource resource, LatherDispatcher latherDispatcher) { long rtn = Long.MIN_VALUE; try { String agentToken = agentManager.getAgent(AppdefUtil.newAppdefEntityId(resource)).getAgentToken(); rtn = latherDispatcher.getLastCommunication(agentToken); } catch (AgentNotFoundException e) { log.debug(e,e); } return rtn; } private long getBeginWindow(long end, Measurement meas) { final long interval = 0; final long wait = 5 * MINUTE; long measInterval = meas.getInterval(); // We have to get at least the measurement interval long maxInterval = Math.max(Math.max(interval, wait), measInterval); // Begin is maximum of interval or measurement create time long begin = Math.max(end - maxInterval, meas.getMtime() + measInterval); return TimingVoodoo.roundDownTime(begin, measInterval); } // End is at least more than 1 interval away private long getEndWindow(long current, Measurement meas) { return TimingVoodoo.roundDownTime((current - meas.getInterval()), meas.getInterval()); } }