/** * NOTE: This copyright does *not* cover user programs that use HQ * program services by normal system calls through the application * program interfaces provided as part of the Hyperic Plug-in Development * Kit or the Hyperic Client Development Kit - this is merely considered * normal use of the program, and does *not* fall under the heading of * "derived work". * * Copyright (C) [2009-2011], VMware, Inc. * This file is part of HQ. * * HQ is free software; you can redistribute it and/or modify * it under the terms version 2 of the GNU General Public License as * published by the Free Software Foundation. This program is distributed * in the hope that it will be useful, but WITHOUT ANY WARRANTY; without * even the implied warranty of MERCHANTABILITY or FITNESS FOR A * PARTICULAR PURPOSE. See the GNU General Public License for more * details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 * USA. 
*/
package org.hyperic.hq.appdef.server.session;

import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.atomic.AtomicBoolean;

import javax.annotation.PostConstruct;
import javax.annotation.PreDestroy;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.hyperic.hq.appdef.shared.AgentManager;
import org.hyperic.hq.authz.server.session.AuthzSubject;
import org.hyperic.hq.authz.shared.AuthzSubjectManager;
import org.hyperic.hq.common.shared.TransactionRetry;
import org.hyperic.hq.context.Bootstrap;
import org.hyperic.hq.measurement.MeasurementConstants;
import org.hyperic.hq.product.shared.PluginManager;
import org.hyperic.hq.stats.ConcurrentStatsCollector;
import org.hyperic.util.TimeUtil;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.scheduling.TaskScheduler;
import org.springframework.stereotype.Component;

/**
 * Queues agent restart requests and issues them from a single background task so that at most
 * {@link #MAX_CONCURRENT_RESTARTS} agents are tracked as "restarting" at any one time.
 * <p>
 * All mutable state (the pending queue, the restart timestamps and the last check-in times) is
 * guarded by {@link #LOCK}. The blocking call that actually restarts an agent is made outside of
 * that lock.
 */
@Component
public class AgentPluginSyncRestartThrottle {

    private static final Log log = LogFactory.getLog(AgentPluginSyncRestartThrottle.class);

    /** Upper bound on the number of non-expired restart records allowed at once. */
    private static final int MAX_CONCURRENT_RESTARTS = 20;

    /** Age (ms) after which a restart record is considered expired. */
    private static final long RECORD_TIMEOUT = 10 * MeasurementConstants.MINUTE;

    /** Minimum time (ms) that must have passed since an agent's last check-in before a restart. */
    private static final long RESTART_PAUSE_TIME = 60000;

    /**
     * agentId to timestamp of agent reboot attempt time
     * if agent does not check in by RECORD_TIMEOUT then the record is expired
     */
    private final Map<Integer, Long> agentRestartTimestampMap = new HashMap<Integer, Long>();

    /** agentIds waiting to be restarted, kept in ascending id order */
    private final TreeSet<Integer> pendingRestarts = new TreeSet<Integer>();

    /** [HHQ-4882] - agents need to be up for at least 60 secs before being restarted */
    private final Map<Integer, Long> lastCheckin = new HashMap<Integer, Long>();

    private final TaskScheduler taskScheduler;
    private final AtomicBoolean shutdown = new AtomicBoolean(false);

    /** Monitor guarding the three collections above; also used to wake the throttler task. */
    private final Object LOCK = new Object();

    private final AuthzSubject overlord;
    private final ConcurrentStatsCollector concurrentStatsCollector;
    private final TransactionRetry transactionRetry;

    @Autowired
    public AgentPluginSyncRestartThrottle(AuthzSubjectManager authzSubjectManager,
                                          ConcurrentStatsCollector concurrentStatsCollector,
                                          TransactionRetry transactionRetry,
                                          @Value("#{scheduler}") TaskScheduler taskScheduler) {
        this.overlord = authzSubjectManager.getOverlordPojo();
        this.concurrentStatsCollector = concurrentStatsCollector;
        this.transactionRetry = transactionRetry;
        this.taskScheduler = taskScheduler;
    }

    /**
     * Registers the restart statistics, starts the throttler task and schedules the periodic
     * invalidator which expires restart records older than {@link #RECORD_TIMEOUT}.
     */
    @PostConstruct
    public void initialize() {
        concurrentStatsCollector.register(ConcurrentStatsCollector.AGENT_PLUGIN_SYNC_RESTARTS);
        concurrentStatsCollector.register(
            ConcurrentStatsCollector.AGENT_PLUGIN_SYNC_PENDING_RESTARTS);
        startThrottlerThread();
        final Runnable invalidator = new Runnable() {
            public void run() {
                try {
                    final boolean debug = log.isDebugEnabled();
                    if (debug) {
                        log.debug("starting PluginSyncRestartInvalidator");
                    }
                    final Runnable runner = new Runnable() {
                        public void run() {
                            getNumRecords(true);
                        }
                    };
                    transactionRetry.runTransaction(runner, 3, 1000);
                    if (debug) {
                        log.debug("done PluginSyncRestartInvalidator");
                    }
                } catch (Throwable t) {
                    // never let a failure escape into the scheduler; just report it
                    log.error("ERROR running PluginSyncRestartInvalidator: " + t, t);
                }
            }
        };
        final Date firstRun = new Date(System.currentTimeMillis() + RECORD_TIMEOUT);
        taskScheduler.scheduleWithFixedDelay(invalidator, firstRun, RECORD_TIMEOUT);
    }

    /**
     * @return a copy of the agentIds that are queued for restart but have not been picked up yet
     */
    public Set<Integer> getQueuedAgentIds() {
        synchronized (LOCK) {
            return new HashSet<Integer>(pendingRestarts);
        }
    }

    /**
     * @return {@link Map} of {@link Integer} agentId to {@link Long} time (ms) which represents
     * the last time the agentId checked in after a restart
     */
    public Map<Integer, Long> getLastCheckinInfo() {
        synchronized (LOCK) {
            return new HashMap<Integer, Long>(lastCheckin);
        }
    }

    /**
     * @return {@link Map} of {@link Integer} agentId to {@link Long} time (ms) which represents
     * when a restart was initiated on the agentId
     */
    public Map<Integer, Long> getAgentIdsInRestartState() {
        synchronized (LOCK) {
            return new HashMap<Integer, Long>(agentRestartTimestampMap);
        }
    }

    /**
     * Schedules the long-running throttler task. Until {@link #shutdown()} is called (or the
     * task's thread is interrupted) it repeatedly waits on {@link #LOCK} for up to 10 seconds,
     * picks the agents that may be restarted and restarts them one by one.
     */
    private void startThrottlerThread() {
        final Runnable throttler = new Runnable() {
            public void run() {
                final AgentManager agentManager = Bootstrap.getBean(AgentManager.class);
                while (!shutdown.get()) {
                    try {
                        final Set<Integer> toRestart;
                        synchronized (LOCK) {
                            // wait 10 secs, or until restartAgent()/shutdown() notifies
                            LOCK.wait(10000);
                            toRestart = getAgentsToRestart();
                        }
                        final long now = now();
                        for (final Integer agentId : toRestart) {
                            try {
                                // restart agents out of the LOCK since it blocks while
                                // communicating to them
                                agentManager.restartAgent(overlord, agentId);
                            } catch (Exception e) {
                                log.error(e, e);
                            } finally {
                                // record the attempt even when the restart call failed
                                synchronized (LOCK) {
                                    agentRestartTimestampMap.put(agentId, now);
                                }
                            }
                        }
                        if (!toRestart.isEmpty()) {
                            concurrentStatsCollector.addStat(
                                toRestart.size(),
                                ConcurrentStatsCollector.AGENT_PLUGIN_SYNC_RESTARTS);
                        }
                    } catch (InterruptedException e) {
                        // Restore the interrupt status and stop: swallowing the interrupt would
                        // keep this loop alive on a thread that has been asked to terminate.
                        Thread.currentThread().interrupt();
                        log.debug("agent restart throttler interrupted, exiting");
                        return;
                    } catch (Throwable t) {
                        log.error(t, t);
                    }
                }
            }
        };
        taskScheduler.schedule(throttler, new Date(System.currentTimeMillis() + 5000));
    }

    /**
     * Removes from the pending queue, in ascending agentId order, the agents that may be restarted
     * right now, without exceeding the remaining capacity
     * ({@link #MAX_CONCURRENT_RESTARTS} minus the number of non-expired restart records).
     * <p>
     * Agents that are not eligible yet stay queued and are skipped, so a single ineligible agent
     * at the head of the queue cannot starve the eligible agents behind it.
     *
     * @return the agentIds to restart; empty when nothing is pending or no capacity is left
     */
    private Set<Integer> getAgentsToRestart() {
        synchronized (LOCK) {
            final int numRestarts = getNumRecords(false);
            if (pendingRestarts.isEmpty() || numRestarts >= MAX_CONCURRENT_RESTARTS) {
                return Collections.emptySet();
            }
            final int capacity = MAX_CONCURRENT_RESTARTS - numRestarts;
            final Set<Integer> rtn = new HashSet<Integer>();
            final Iterator<Integer> it = pendingRestarts.iterator();
            while (rtn.size() < capacity && it.hasNext()) {
                final Integer agentId = it.next();
                if (canRestart(agentId)) {
                    it.remove();
                    rtn.add(agentId);
                }
            }
            return rtn;
        }
    }

    /**
     * @param agentId the agent to check
     * @return {@code true} if the agent has no restart record and either never checked in or
     * checked in more than {@link #RESTART_PAUSE_TIME} ms ago
     */
    private boolean canRestart(Integer agentId) {
        synchronized (LOCK) {
            if (agentRestartTimestampMap.get(agentId) != null) {
                // Agent is currently restarting
                return false;
            }
            final Long last = lastCheckin.get(agentId);
            final long now = now();
            if (log.isDebugEnabled()) {
                log.debug("agentId=" + agentId + " lastCheckin="
                    + ((last == null) ? null : TimeUtil.toString(last))
                    + ", minRestartTime="
                    + ((last == null) ? TimeUtil.toString(now)
                                      : TimeUtil.toString(last + RESTART_PAUSE_TIME)));
            }
            return last == null || now > (last + RESTART_PAUSE_TIME);
        }
    }

    private long now() {
        return System.currentTimeMillis();
    }

    /**
     * Counts the restart records that are younger than {@link #RECORD_TIMEOUT}.
     *
     * @param invalidate when {@code true}, expired records are removed and, outside of the lock,
     * the plugin sync status of each expired agent is updated from
     * {@code SYNC_IN_PROGRESS} to {@code SYNC_FAILURE}; when {@code false} expired records are
     * only left out of the count
     * @return the number of non-expired restart records
     */
    private int getNumRecords(final boolean invalidate) {
        final long now = System.currentTimeMillis();
        final boolean debug = log.isDebugEnabled();
        final Set<Integer> restartFailures = new HashSet<Integer>();
        int rtn = 0;
        synchronized (LOCK) {
            final Iterator<Entry<Integer, Long>> it =
                agentRestartTimestampMap.entrySet().iterator();
            while (it.hasNext()) {
                final Entry<Integer, Long> entry = it.next();
                final Long timestamp = entry.getValue();
                if ((now - timestamp) < RECORD_TIMEOUT) {
                    rtn++;
                } else if (invalidate) {
                    restartFailures.add(entry.getKey());
                    it.remove();
                }
            }
        }
        // restartFailures is only ever populated when invalidate is true
        if (!restartFailures.isEmpty()) {
            final PluginManager pm = Bootstrap.getBean(PluginManager.class);
            for (final Integer agentId : restartFailures) {
                if (debug) {
                    log.debug("invalidating restart status for agentId=" + agentId);
                }
                pm.updateAgentPluginSyncStatus(
                    agentId,
                    AgentPluginStatusEnum.SYNC_IN_PROGRESS,
                    AgentPluginStatusEnum.SYNC_FAILURE);
            }
        }
        return rtn;
    }

    /**
     * Clears the restart record of the agent, if any, and stores the current time as its last
     * check-in.
     *
     * @param agentId the agent that checked in
     */
    public void checkinAfterRestart(Integer agentId) {
        final boolean debug = log.isDebugEnabled();
        final long now = now();
        final boolean removed;
        synchronized (LOCK) {
            removed = agentRestartTimestampMap.remove(agentId) != null;
            lastCheckin.put(agentId, now);
        }
        if (debug) {
            log.debug("agentId=" + agentId + " checking in after reboot, removed = " + removed);
        }
    }

    /**
     * Queues the agent for restart and wakes the throttler task. The restart itself is issued
     * later by the throttler task.
     *
     * @param agentId the agent to restart
     */
    public void restartAgent(Integer agentId) {
        if (log.isDebugEnabled()) {
            log.debug("agentId=" + agentId + " added to list of pending reboots");
        }
        concurrentStatsCollector.addStat(
            1, ConcurrentStatsCollector.AGENT_PLUGIN_SYNC_PENDING_RESTARTS);
        synchronized (LOCK) {
            pendingRestarts.add(agentId);
            LOCK.notifyAll();
        }
    }

    /**
     * Flags the throttler task to stop and wakes it so it can observe the flag.
     */
    @PreDestroy
    public void shutdown() {
        shutdown.set(true);
        synchronized (LOCK) {
            LOCK.notifyAll();
        }
    }
}