/* * NOTE: This copyright does *not* cover user programs that use HQ * program services by normal system calls through the application * program interfaces provided as part of the Hyperic Plug-in Development * Kit or the Hyperic Client Development Kit - this is merely considered * normal use of the program, and does *not* fall under the heading of * "derived work". * * Copyright (C) [2004-2011], VMWare, Inc. * This file is part of HQ. * * HQ is free software; you can redistribute it and/or modify * it under the terms version 2 of the GNU General Public License as * published by the Free Software Foundation. This program is distributed * in the hope that it will be useful, but WITHOUT ANY WARRANTY; without * even the implied warranty of MERCHANTABILITY or FITNESS FOR A * PARTICULAR PURPOSE. See the GNU General Public License for more * details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 * USA. */ package org.hyperic.hq.agent.server.session; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import java.util.concurrent.ScheduledThreadPoolExecutor; import java.util.concurrent.ThreadFactory; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicLong; import javax.annotation.PostConstruct; import javax.annotation.PreDestroy; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.hyperic.hq.appdef.Agent; import org.hyperic.hq.appdef.server.session.AgentPluginSyncRestartThrottle; import org.hyperic.hq.appdef.shared.AgentManager; import org.hyperic.hq.authz.server.session.AuthzSubject; import org.hyperic.hq.authz.shared.AuthzSubjectManager; import org.hyperic.hq.common.DiagnosticObject; import org.hyperic.hq.common.DiagnosticsLogger; import org.hyperic.hq.common.SystemException; import org.hyperic.hq.measurement.MeasurementConstants; import org.hyperic.hq.measurement.shared.AvailabilityManager; import org.hyperic.hq.stats.ConcurrentStatsCollector; import org.hyperic.util.stats.StatCollector; import org.hyperic.util.stats.StatUnreachableException; import org.springframework.beans.BeansException; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Value; import org.springframework.context.ApplicationContext; import org.springframework.context.ApplicationContextAware; import org.springframework.stereotype.Component; @Component public class AgentSynchronizer implements DiagnosticObject, ApplicationContextAware { private final int NUM_WORKERS; private static final long WAIT_TIME = 5 * MeasurementConstants.MINUTE; private static final int DEFAULT_NUM_WORKERS = 20; private final Log log = LogFactory.getLog(AgentSynchronizer.class.getName()); private final Set<Integer> activeAgents = Collections.synchronizedSet(new HashSet<Integer>()); private final LinkedList<StatefulAgentDataTransferJob> agentJobs = new LinkedList<StatefulAgentDataTransferJob>(); /** used mainly for diagnostics. map of job description and number of times it has run */ private final Map<String, Integer> fullDiagInfo = new HashMap<String, Integer>(); private final AtomicBoolean shutdown = new AtomicBoolean(false); private final AtomicLong executorNum = new AtomicLong(0); private final ConcurrentStatsCollector concurrentStatsCollector; private final AuthzSubject overlord; private final AgentPluginSyncRestartThrottle agentPluginSyncRestartThrottle; private ApplicationContext ctx; private ScheduledThreadPoolExecutor executor ; @Autowired public AgentSynchronizer(ConcurrentStatsCollector concurrentStatsCollector, DiagnosticsLogger diagnosticsLogger, AgentPluginSyncRestartThrottle agentPluginSyncRestartThrottle, AuthzSubjectManager authzSubjectManager) { this.concurrentStatsCollector = concurrentStatsCollector; this.overlord = authzSubjectManager.getOverlordPojo(); this.agentPluginSyncRestartThrottle = agentPluginSyncRestartThrottle; diagnosticsLogger.addDiagnosticObject(this); NUM_WORKERS = getNumWorkers(); } private int getNumWorkers() { if (DEFAULT_NUM_WORKERS > 0) { return DEFAULT_NUM_WORKERS; } else { int cpus = Runtime.getRuntime().availableProcessors(); if (cpus > 4) { return 4; } else if (cpus <= 1) { return 1; } else { return cpus; } } } public Set<Integer> getJobListByDescription(Collection<String> descriptions) { List<AgentDataTransferJob> jobs; final Set<String> descs = new HashSet<String>(descriptions); synchronized (agentJobs) { jobs = new ArrayList<AgentDataTransferJob>(agentJobs); } final Set<Integer> rtn = new HashSet<Integer>(); for (final AgentDataTransferJob job : jobs) { if (descs.contains(job.getJobDescription())) { rtn.add(job.getAgentId()); } } return rtn; } @PostConstruct void initialize() { this.executor = new ScheduledThreadPoolExecutor(NUM_WORKERS, new ThreadFactory() { private final AtomicLong i = new AtomicLong(0); public Thread newThread(Runnable r) { return new Thread(r, "AgentSynchronizer" + i.getAndIncrement()); } }); log.info("starting AgentSynchronizer with " + NUM_WORKERS + " threads"); for (int i=0; i<NUM_WORKERS; i++) { SchedulerThread worker = new SchedulerThread("AgentSynchronizer" + i, i*1000); executor.scheduleWithFixedDelay(worker, i+1, NUM_WORKERS, TimeUnit.SECONDS); } concurrentStatsCollector.register(ConcurrentStatsCollector.AGENT_SYNC_JOB_QUEUE_ADDS); concurrentStatsCollector.register(new StatCollector() { public long getVal() throws StatUnreachableException { synchronized (agentJobs) { return agentJobs.size(); } } public String getId() { return ConcurrentStatsCollector.AGENT_SYNCHRONIZER_QUEUE_SIZE; } }); } public void addAgentJob(AgentDataTransferJob agentJob) { addAgentJob(agentJob, false); } /** * @param agentJob job to execute in the background * @param isPriority - will add job to the top of the list */ public void addAgentJob(AgentDataTransferJob agentJob, boolean isPriority) { if (log.isDebugEnabled()) log.debug("adding job=" + agentJob); synchronized (agentJobs) { if (isPriority) { agentJobs.addFirst(new StatefulAgentDataTransferJob(agentJob)); } else { agentJobs.add(new StatefulAgentDataTransferJob(agentJob)); } } concurrentStatsCollector.addStat(1, ConcurrentStatsCollector.AGENT_SYNC_JOB_QUEUE_ADDS); } private class SchedulerThread implements Runnable { private final String name; private final long initialSleep; private SchedulerThread(String name, long initialSleep) { this.name = name; this.initialSleep = initialSleep; } @Override public String toString() { return name; } public synchronized void run() { try { Thread.sleep(initialSleep); } catch (InterruptedException e) { log.debug(e,e); } while (!shutdown.get()) { try { boolean hasMoreScheduleJobs = true; while (hasMoreScheduleJobs) { hasMoreScheduleJobs = syncData(name); } Thread.sleep(NUM_WORKERS*1000); } catch (Throwable t) { log.error(t, t); } } } } private boolean syncData(String name) { StatefulAgentDataTransferJob job = null; synchronized (agentJobs) { log.debug("agentJobs, number of jobs left: " + agentJobs.size() + " thread name: " + name); job = agentJobs.poll(); } if (job == null) { return false; } if (log.isDebugEnabled()) { log.debug("agentJobs, working on new job from agentJobs queue:" + getJobInfo(job) + " RuntimeId: "+ job.getRuntimeTime() +" , number of jobs left: " + agentJobs.size() + " thread name: " + name); } Integer agentId = null; try { agentId = job.getAgentId(); final boolean debug = log.isDebugEnabled(); boolean added; if (!job.canRun()) { added = false; } else { added = activeAgents.add(agentId); } if (!added) { reAddJob(job); agentId = null; // return false so that this mechanism doesn't spin out of control // allow the other thread some time to get its job done return false; } if (debug) log.debug("executing agent data transfer agentId=" + agentId + " jobdesc=" + job.getJobDescription()); executeJob(job); setDiags(job); synchronized (agentJobs) { return !agentJobs.isEmpty(); } } catch (Exception e) { throw new SystemException("Error executing " + getJobInfo(job), e); } finally { if (agentId != null) { activeAgents.remove(agentId); } } } private void executeJob(final StatefulAgentDataTransferJob job) throws InterruptedException { final String name = Thread.currentThread().getName() + "-" + executorNum.getAndIncrement(); final Thread thread = new Thread(name) { @Override public void run() { job.setLastRuntime(); if (agentIsPingable(job)) { try { job.execute(); } catch (Throwable e) { if (e instanceof InterruptedException) { log.warn("jobdesc=" + job.getJobDescription() + " was interrupted: " + e); log.debug(e,e); } else { log.error(e,e); } } return; } else { log.warn("Could not ping agent in order to run job " + getJobInfo(job)); } } }; thread.start(); thread.join(WAIT_TIME); // if the thread is alive just try to interrupt it and keep going final boolean threadIsAlive = thread.isAlive(); final boolean jobWasSuccessful = job.wasSuccessful(); final AvailabilityManager availabilityManager = ctx.getBean(AvailabilityManager.class); final boolean platformIsAvailable = availabilityManager.platformIsAvailableOrUnknown(job.getAgentId()) || isInRestartState(job.getAgentId()); if (jobWasSuccessful) { // do nothing, this is good! return; } else if (platformIsAvailable) { if (threadIsAlive) { thread.interrupt(); } job.incrementFailures(); if(log.isDebugEnabled()){ log.debug("executeJob, number of failures for execute job=" + getJobInfo(job) + " RuntimeId: "+ job.getRuntimeTime() +" " + job.getNumberOfFailures()); } if (job.discardJob()) { job.onFailure("Too many failures on agent " + job.getAgentId() + " RuntimeId: "+ job.getRuntimeTime() ); } else { reAddJob(job); if (threadIsAlive) { log.warn("AgentDataTransferJob=" + getJobInfo(job) + " has take more than " + WAIT_TIME/1000/60 + " minutes to run. The agent appears alive so therefore the job was" + " interrupted and requeued. Job threadName={" + thread.getName() + "}"); } else { log.warn("AgentDataTransferJob=" + getJobInfo(job) + " died and was not successful. The agent appears alive and" + " therefore the job was requeued. " + " Job threadName={" + thread.getName() + "}" + " RuntimeId: "+ job.getRuntimeTime()); } } } else { if (threadIsAlive) { thread.interrupt(); log.warn("AgentDataTransferJob=" + getJobInfo(job) + " has take more than " + WAIT_TIME/1000/60 + " minutes to run. Discarding job threadName={" + thread.getName() + "}"); } // Can't ping agent and platform availability is down, therefore agent must be down job.onFailure("Platform associated with agent " + job.getAgentId() + " is not available"); } } private boolean isInRestartState(int agentId) { return agentPluginSyncRestartThrottle.getAgentIdsInRestartState().containsKey(agentId); } private boolean reAddJob(StatefulAgentDataTransferJob job) { if (job.discardJob()) { return false; } synchronized (agentJobs) { agentJobs.add(job); if(log.isDebugEnabled()){ log.debug("Readd job to the queue: " + job.getJobDescription() + " RuntimeId: "+ job.getRuntimeTime() +" queue size: " +agentJobs.size()); } } return true; } private boolean agentIsPingable(AgentDataTransferJob job) { try { // XXX need to set this in the constructor final AgentManager agentManager = ctx.getBean(AgentManager.class); agentManager.pingAgent(overlord, job.getAgentId()); } catch (Exception e) { log.debug(e,e); return false; } return true; } private String getJobInfo(AgentDataTransferJob job) { final String desc = job.getJobDescription(); final String address = getAgentAddress(job.getAgentId()); return new StringBuilder(desc.length() + 32) .append("{agentId=").append(job.getAgentId()) .append(", agentAddress=").append(address) .append(", desc=").append(desc).append("}") .toString(); } private String getAgentAddress(int agentId) { final AgentManager agentManager = ctx.getBean(AgentManager.class); if (agentManager == null) { return ""; } Agent agent = agentManager.getAgent(agentId); if (agent == null) { return ""; } return agent.getAddress(); } private void setDiags(AgentDataTransferJob job) { synchronized(fullDiagInfo) { final String desc = job.getJobDescription() + ", agentId=" + job.getAgentId(); final Integer runs = fullDiagInfo.get(desc); if (runs == null) { fullDiagInfo.put(desc, 1); } else { fullDiagInfo.put(desc, (runs+1)); } } } public String getStatus() { return getStatus(fullDiagInfo); } public String getShortStatus() { return getStatus(fullDiagInfo); } private String getStatus(Map<String, Integer> diag) { Map<String, Integer> diags = null; synchronized(diag) { diags = new HashMap<String, Integer>(diag); } final StringBuilder buf = new StringBuilder(); buf.append("\nTop 10 - Agent Synchronizer Diagnostics (job desc - number of executes):\n"); final List<Entry<String, Integer>> diagList = new ArrayList<Entry<String, Integer>>(diags.entrySet()); Collections.sort(diagList, new Comparator<Entry<String, Integer>>() { public int compare(Entry<String, Integer> e1, Entry<String, Integer> e2) { if (e1 == e2) { return 0; } return e2.getValue().compareTo(e1.getValue()); } }); int i=0; for (final Entry<String, Integer> entry : diagList) { if (i++ >= 10) { break; } buf.append(" ").append(entry.getKey()).append(" - ") .append(entry.getValue()).append("\n"); } return buf.toString(); } public String getShortName() { return "agentSynchronizer"; } public String getName() { return "Agent Synchronizer"; } public void setApplicationContext(ApplicationContext applicationContext) throws BeansException { this.ctx = applicationContext; } private class StatefulAgentDataTransferJob implements AgentDataTransferJob { private static final int MAX_FAILURES = 5;//60; private static final long TIME_BTWN_RUNS = MeasurementConstants.MINUTE; private final AgentDataTransferJob job; private int numFailures = 0; private long lastRuntime = Long.MIN_VALUE; private long runtimeTime = now(); private StatefulAgentDataTransferJob(AgentDataTransferJob job) { this.job = job; } public void setLastRuntime() { lastRuntime = now(); } public int getAgentId() { return job.getAgentId(); } public String getJobDescription() { return job.getJobDescription(); } public void execute() { job.execute(); } public boolean wasSuccessful() { return job.wasSuccessful(); } private void incrementFailures() { numFailures++; } public void onFailure(String reason) { job.onFailure(reason); } private boolean discardJob() { return numFailures >= MAX_FAILURES; } private boolean canRun() { if (numFailures >= MAX_FAILURES) { return false; } if (lastRuntime != Long.MIN_VALUE && (lastRuntime+TIME_BTWN_RUNS) > now()) { return false; } return true; } public int getNumberOfFailures(){ return numFailures; } public long getRuntimeTime() { return runtimeTime; } } private long now() { return System.currentTimeMillis(); } @PreDestroy public void shutdown() { shutdown.set(true); this.executor.shutdown() ; }//EOM }