/* * NOTE: This copyright does *not* cover user programs that use HQ * program services by normal system calls through the application * program interfaces provided as part of the Hyperic Plug-in Development * Kit or the Hyperic Client Development Kit - this is merely considered * normal use of the program, and does *not* fall under the heading of * "derived work". * * Copyright (C) [2004-2012], VMware, Inc. * This file is part of HQ. * * HQ is free software; you can redistribute it and/or modify * it under the terms version 2 of the GNU General Public License as * published by the Free Software Foundation. This program is distributed * in the hope that it will be useful, but WITHOUT ANY WARRANTY; without * even the implied warranty of MERCHANTABILITY or FITNESS FOR A * PARTICULAR PURPOSE. See the GNU General Public License for more * details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 * USA. 
*/ package org.hyperic.hq.measurement.agent.server; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Properties; import java.util.Random; import java.util.Set; import java.util.concurrent.Executors; import java.util.concurrent.Future; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.RejectedExecutionException; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.ScheduledFuture; import java.util.concurrent.ThreadFactory; import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicLong; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.hyperic.hq.agent.diagnostics.AgentDiagnosticObject; import org.hyperic.hq.agent.diagnostics.AgentDiagnostics; import org.hyperic.hq.agent.server.AgentStartException; import org.hyperic.hq.agent.server.monitor.AgentMonitorException; import org.hyperic.hq.agent.server.monitor.AgentMonitorSimple; import org.hyperic.hq.agent.stats.AgentStatsCollector; import org.hyperic.hq.appdef.shared.AppdefEntityID; import org.hyperic.hq.measurement.MeasurementConstants; import org.hyperic.hq.measurement.TimingVoodoo; import org.hyperic.hq.measurement.agent.ScheduledMeasurement; import org.hyperic.hq.product.GenericPlugin; import org.hyperic.hq.product.MeasurementValueGetter; import org.hyperic.hq.product.Metric; import org.hyperic.hq.product.MetricInvalidException; import org.hyperic.hq.product.MetricNotFoundException; import org.hyperic.hq.product.MetricUnreachableException; import org.hyperic.hq.product.MetricValue; import org.hyperic.hq.product.PluginException; import org.hyperic.hq.product.PluginNotFoundException; import 
org.hyperic.hq.util.properties.PropertiesUtil;
import org.hyperic.util.TimeUtil;
import org.hyperic.util.collection.IntHashMap;
import org.hyperic.util.schedule.EmptyScheduleException;
import org.hyperic.util.schedule.Schedule;
import org.hyperic.util.schedule.ScheduleException;
import org.hyperic.util.schedule.ScheduledItem;
import org.hyperic.util.schedule.UnscheduledItemException;

/**
 * The schedule thread which maintains the schedule, and dispatches on them.
 * After data is retrieved, it is sent to the SenderThread which handles
 * depositing the results on disk, and sending them to the bizapp.
 */
public class ScheduleThread extends AgentMonitorSimple implements Runnable, AgentDiagnosticObject {

    // Names of the agent statistics this thread reports.
    private static final String SCHEDULE_THREAD_METRICS_COLLECTED_TIME =
        AgentStatsCollector.SCHEDULE_THREAD_METRICS_COLLECTED_TIME;
    private static final String SCHEDULE_THREAD_METRIC_TASKS_SUBMITTED =
        AgentStatsCollector.SCHEDULE_THREAD_METRIC_TASKS_SUBMITTED;
    private static final String SCHEDULE_THREAD_METRIC_COLLECT_FAILED =
        AgentStatsCollector.SCHEDULE_THREAD_METRIC_COLLECT_FAILED;

    // Agent properties configuration
    static final String PROP_POOLSIZE = "scheduleThread.poolsize."; // e.g. scheduleThread.poolsize.system=10
    static final String PROP_FETCH_LOG_TIMEOUT = "scheduleThread.fetchLogTimeout";
    static final String PROP_CANCEL_TIMEOUT = "scheduleThread.cancelTimeout";
    static final String PROP_QUEUE_SIZE = "scheduleThread.queuesize.";

    // When true, the server/agent clock offset is subtracted from metric
    // value timestamps (see MetricTask.run and the constructor).
    private boolean deductServerTimeDiff = true;

    // How often we check schedules when we think they are empty.
    private static final int ONE_SECOND = 1000;
    private static final int POLL_PERIOD = ONE_SECOND;
    // How long a resource stays marked unreachable before metrics are
    // re-enabled: 5 minutes.
    private static final int UNREACHABLE_EXPIRE = (60 * 1000) * 5;
    private static final long FETCH_TIME = 2000; // 2 seconds.
    private static final long CANCEL_TIME = 5000; // 5 seconds.
    // Default bound for each per-plugin executor's work queue.
    private static final int EXECUTOR_QUEUE_SIZE = 10000;

    // Effective timeouts; overridable via agent.properties (see constructor).
    private long logFetchTimeout = FETCH_TIME;
    private long cancelTimeout = CANCEL_TIME;

    private static final Log log = LogFactory.getLog(ScheduleThread.class.getName());

    // AppdefID -> Schedule
    private final Map<String,ResourceSchedule> schedules = new HashMap<String,ResourceSchedule>();

    // Should I shut down?
    private final AtomicBoolean shouldDie = new AtomicBoolean(false);

    // Interrupt object
    private final Object interrupter = new Object();

    // Hash of DSNs to their errors
    private final HashMap<String,String> errors = new HashMap<String,String>();

    private final Properties agentConfig; // agent.properties

    // Map of Executors, one per plugin
    private final HashMap<String,ThreadPoolExecutor> executors =
        new HashMap<String,ThreadPoolExecutor>();

    // Map of asynchronous MetricTasks pending confirmation
    private final HashMap<Future<?>,MetricTask> metricCollections =
        new HashMap<Future<?>,MetricTask>();

    // The executor confirming metric collections, cancelling tasks that exceed
    // our timeouts.
    private final ScheduledExecutorService metricVerificationService;
    private final ScheduledFuture<?> metricVerificationTask;
    private final ScheduledFuture<?> metricLoggingTask;

    private final MeasurementValueGetter manager;
    private final Sender sender; // Guy handling the results

    // Derived-measurement ids currently scheduled; guards against
    // double-scheduling the same measurement.
    private final Set<Integer> scheduled = new HashSet<Integer>();

    // Statistics
    private final Object statsLock = new Object();
    private long statNumMetricsFetched = 0;
    private long statNumMetricsFailed = 0;
    private long statTotFetchTime = 0;
    private long statNumMetricsScheduled = 0;
    private long statMaxFetchTime = Long.MIN_VALUE;
    private long statMinFetchTime = Long.MAX_VALUE;

    private final AgentStatsCollector statsCollector;
    private final Random rand = new Random();
    // Scheduling jitter bound in ms; ONE_SECOND unless agent.dsl.fudge is set
    // (see getFudgeFactor()).
    private final int offset;
    // Per-resource diagnostic counters and samples, guarded by its own monitor.
    private final Map<AppdefEntityID, DiagInfo> diagInfo = new HashMap<AppdefEntityID, DiagInfo>();

    /**
     * Build the schedule thread: registers the agent statistics, reads the
     * timeout overrides from agent.properties, and starts the single-thread
     * scheduled executor that runs the metric verification and logging tasks.
     *
     * @param sender  sink that receives collected metric values
     * @param manager source used to fetch metric values from plugins
     * @param config  agent.properties
     * @throws AgentStartException declared for callers; not thrown directly here
     */
    ScheduleThread(Sender sender, MeasurementValueGetter manager, Properties config)
        throws AgentStartException {
        this.statsCollector = AgentStatsCollector.getInstance();
        this.statsCollector.register(SCHEDULE_THREAD_METRIC_COLLECT_FAILED);
        this.statsCollector.register(SCHEDULE_THREAD_METRIC_TASKS_SUBMITTED);
        this.statsCollector.register(SCHEDULE_THREAD_METRICS_COLLECTED_TIME);
        this.agentConfig = config;
        this.manager = manager;
        this.sender = sender;
        int tmp = getFudgeFactor();
        if (tmp <= 0) {
            offset = ONE_SECOND;
        } else {
            offset = tmp;
            log.info("fudgeFactor is set to " + offset + " ms");
        }
        String sLogFetchTimeout = agentConfig.getProperty(PROP_FETCH_LOG_TIMEOUT);
        if(sLogFetchTimeout != null){
            try {
                logFetchTimeout = Integer.parseInt(sLogFetchTimeout);
                log.info("Log fetch timeout set to " + logFetchTimeout);
            } catch(NumberFormatException exc){
                log.error("Invalid setting for " + PROP_FETCH_LOG_TIMEOUT +
                          " value=" + sLogFetchTimeout + ", using defaults.");
            }
        }
        String sCancelTimeout = agentConfig.getProperty(PROP_CANCEL_TIMEOUT);
        if(sCancelTimeout != null){
            try {
                cancelTimeout = Integer.parseInt(sCancelTimeout);
                log.info("Cancel timeout set to " + cancelTimeout);
            } catch(NumberFormatException exc){
                log.error("Invalid setting for " + PROP_CANCEL_TIMEOUT +
                          " value=" + sCancelTimeout + ", using defaults.");
            }
        }
        metricVerificationService = Executors.newSingleThreadScheduledExecutor();
        metricVerificationTask =
            metricVerificationService.scheduleAtFixedRate(new MetricVerificationTask(),
                                                          POLL_PERIOD, POLL_PERIOD,
                                                          TimeUnit.MILLISECONDS);
        metricLoggingTask =
            metricVerificationService.scheduleAtFixedRate(new MetricLoggingTask(),
                                                          1, 600, TimeUnit.SECONDS);
        AgentDiagnostics.getInstance().addDiagnostic(this);
        // By default the server-time-offset deduction feature is on.
        Boolean deductServerOffset = PropertiesUtil.getBooleanValue(
            agentConfig.getProperty(ServerTimeDiff.PROP_DEDUCT_SERVER_TIME_DIFF), true);
        deductServerTimeDiff = deductServerOffset;
    }

    /**
     * Task for printing Executor statistics
     */
    private class MetricLoggingTask implements Runnable {
        // NOTE(review): iterates executors without synchronizing on it, while
        // collect() mutates the map under synchronized(executors) -- confirm
        // whether a ConcurrentModificationException is possible here.
        public void run() {
            for (String plugin : executors.keySet()) {
                ThreadPoolExecutor executor = executors.get(plugin);
                if (log.isDebugEnabled()) {
                    log.debug("Plugin=" + plugin + ", " +
                              "CompletedTaskCount=" + executor.getCompletedTaskCount() + ", " +
                              "ActiveCount=" + executor.getActiveCount() + ", " +
                              "TaskCount=" + executor.getTaskCount() + ", " +
                              "PoolSize=" + executor.getPoolSize());
                }
            }
        }
    }

    /**
     * The MetricVerificationTask iterates over the list of FutureTasks that
     * have been submitted for execution.  Each task is checked for completion
     * and then removed.  Tasks that do not complete within the timeout
     * are cancelled, which will attempt to free up the executor running the
     * task.
     * NOTE: This will only work if the hung task is in an interrupt-able state
     * i.e. sleep() or wait()
     */
    private class MetricVerificationTask implements Runnable {
        public void run() {
            boolean isDebugEnabled = log.isDebugEnabled();
            synchronized (metricCollections) {
                if (isDebugEnabled && (metricCollections.size() > 0)) {
                    log.debug(metricCollections.size() + " metrics to validate.");
                }
                for (Iterator<Future<?>> i = metricCollections.keySet().iterator(); i.hasNext();) {
                    Future<?> t = i.next();
                    MetricTask mt = metricCollections.get(t);
                    if (t.isDone()) {
                        if (isDebugEnabled) {
                            log.debug("Metric task '" + mt + "' complete, duration: " +
                                      mt.getExecutionDuration());
                        }
                        i.remove();
                    } else {
                        // Not complete, check for timeout
                        if (mt.getExecutionDuration() > cancelTimeout) {
                            boolean res = t.cancel(true);
                            log.error("Metric '" + mt + "' took too long to run (" +
                                      mt.getExecutionDuration() + "ms), cancelled (result=" +
                                      res + ")");
                            // If the metric is Availability, send a down data point in
                            // case the metric cancellation fails.
                            ParsedTemplate pt = getParsedTemplate(mt.meas);
                            if (pt.metric.isAvail()) {
                                MetricValue data = new MetricValue(MeasurementConstants.AVAIL_DOWN);
                                sender.processData(mt.meas.getDsnID(), data,
                                                   mt.meas.getDerivedID(), true);
                            }
                            // Task will be removed on next iteration
                        }
                    }
                }
            }
        }
    }

    // Per-resource scheduling state: the schedule itself, a retry list for
    // MetricValue.FUTURE results, unreachable bookkeeping, and the set of
    // metric ids already collected in the current pass.
    private static class ResourceSchedule {
        private final Schedule schedule = new Schedule();
        private AppdefEntityID id;
        private long lastUnreachble = 0;
        private final List<ScheduledMeasurement> retry = new ArrayList<ScheduledMeasurement>();
        private final IntHashMap collected = new IntHashMap();
    }

    /**
     * Look up (or lazily create) the ResourceSchedule for the entity the
     * measurement belongs to.  Thread-safe via the schedules monitor.
     */
    private ResourceSchedule getSchedule(ScheduledMeasurement meas) {
        String key = meas.getEntity().getAppdefKey();
        ResourceSchedule schedule;
        synchronized (schedules) {
            schedule = schedules.get(key);
            if (schedule == null) {
                schedule = new ResourceSchedule();
                schedule.id = meas.getEntity();
                schedules.put(key, schedule);
                log.debug("Created ResourceSchedule for: " + key);
            }
        }
        return schedule;
    }

    // TODO: I don't think this works properly, hence the slow agent shutdowns..
    // Wake the schedule thread out of its poll wait (see run()).
    private void interruptMe(){
        synchronized (interrupter) {
            interrupter.notify();
        }
    }

    /**
     * Shut down the schedule thread.
     */
    void die(){
        shouldDie.set(true);
        for (String s : executors.keySet()) {
            ThreadPoolExecutor executor = executors.get(s);
            List<Runnable> queuedMetrics = executor.shutdownNow();
            log.info("Shut down executor service for plugin '" + s + "'" +
                     " with " + queuedMetrics.size() + " queued collections");
        }
        metricLoggingTask.cancel(true);
        metricVerificationTask.cancel(true);
        List<Runnable> pending = metricVerificationService.shutdownNow();
        log.info("Shutdown metric verification task with " + pending.size() + " tasks");
        interruptMe();
    }

    /**
     * Un-schedule a previously scheduled, repeatable measurement.
     *
     * @param ent Entity to un-schedule metrics for
     *
     * @throws UnscheduledItemException indicating the passed ID was not found
     */
    void unscheduleMeasurements(AppdefEntityID ent)
        throws UnscheduledItemException {
        String key = ent.getAppdefKey();
        ScheduledItem[] items;
        ResourceSchedule rs;
        synchronized (schedules) {
            rs = schedules.remove(key);
        }
        if (rs == null) {
            if (log.isDebugEnabled()) {
                log.debug("No measurement schedule for: " + key);
            }
            return;
        }
        setDiagScheduled(rs, false);
        items = rs.schedule.getScheduledItems();
        log.debug("Un-scheduling " + items.length + " metrics for " + ent);
        synchronized (statsLock) {
            statNumMetricsScheduled -= items.length;
        }
        synchronized (scheduled) {
            for (ScheduledItem item : items) {
                ScheduledMeasurement meas = (ScheduledMeasurement) item.getObj();
                scheduled.remove(meas.getDerivedID());
                //For plugin/Collector awareness
                ParsedTemplate tmpl = getParsedTemplate(meas);
                if ((tmpl == null) || (tmpl.metric == null)) {
                    continue;
                }
                tmpl.metric.setInterval(-1);
            }
        }
    }

    /**
     * Schedule a measurement to be taken at a given interval.
     *
     * @param meas Measurement to schedule
     */
    void scheduleMeasurement(ScheduledMeasurement meas){
        int mid = meas.getDerivedID();
        synchronized (scheduled) {
            if (scheduled.contains(mid)) {
                // Already scheduled; nothing to do.
                return;
            }
            scheduled.add(mid);
        }
        ResourceSchedule rs = getSchedule(meas);
        setDiagScheduled(rs, true);
        try {
            rs.schedule.scheduleItem(meas, meas.getInterval(), true, true);
            if (log.isDebugEnabled()) {
                Long timeOfNext;
                try {
                    timeOfNext = rs.schedule.getTimeOfNext();
                } catch (EmptyScheduleException e) {
                    timeOfNext = null;
                }
                log.debug("scheduleMeasurement timeOfNext=" + TimeUtil.toString(timeOfNext) +
                          ", template=" + getParsedTemplate(meas).metric.toDebugString());
            }
            synchronized (statsLock) {
                statNumMetricsScheduled++;
            }
        } catch (ScheduleException e) {
            log.error("Unable to schedule metric '" + getParsedTemplate(meas) +
                      "', skipping. Cause is " + e, e);
            // Roll back the reservation so a later attempt can retry.
            synchronized (scheduled) {
                scheduled.remove(mid);
            }
        }
    }

    // Full-control overload of the error logger: suppresses repeats of the
    // same message for the same metric unless debug logging is enabled.
    private void logCache(String basicMsg, ParsedTemplate tmpl, String msg,
                          Exception exc, boolean printStack){
        String oldMsg;
        boolean isDebug = log.isDebugEnabled();
        synchronized(errors){
            oldMsg = errors.get(tmpl.metric.toString());
        }
        if(!isDebug && (oldMsg != null) && oldMsg.equals(msg)){
            return;
        }
        if(isDebug){
            log.error(basicMsg + " while processing Metric '" + tmpl + "'", exc);
        } else {
            log.error(basicMsg + ": " + msg);
            if (printStack) {
                log.error("Stack trace follows:", exc);
            }
        }
        synchronized(errors){
            errors.put(tmpl.metric.toString(), msg);
        }
    }

    /**
     * A method which does the main logging for the run() method.  It
     * ensures that we don't perform excessive logging when plugins
     * generate a lot of errors.
     *
     * @param basicMsg The basic log message
     * @param tmpl     The template causing the errors
     * @param exc      The Exception to be logged.
    */
    private void logCache(String basicMsg, ParsedTemplate tmpl, Exception exc){
        logCache(basicMsg, tmpl, exc.getMessage(), exc, false);
    }

    // Forget the cached error for this metric so the next failure is logged.
    private void clearLogCache(ParsedTemplate tmpl){
        synchronized(errors){
            errors.remove(tmpl.metric.toString());
        }
    }

    // A measurement template split into the owning plugin name and the
    // parsed Metric portion of the DSN.
    static class ParsedTemplate {
        String plugin;
        Metric metric;

        @Override
        public String toString() {
            return plugin + ":" + metric.toDebugString();
        }
    }

    /**
     * Split a measurement's DSN ("plugin:metric") into a ParsedTemplate.
     * If the DSN contains no ':' separator the returned template has null
     * plugin and metric fields.
     */
    static ParsedTemplate getParsedTemplate(ScheduledMeasurement meas) {
        ParsedTemplate tmpl = new ParsedTemplate();
        String template = meas.getDSN();
        //duplicating some code from MeasurementPluginManager
        //so we can do Metric.setId
        int ix = template.indexOf(":");
        if (ix < 0) {
            return tmpl;
        }
        tmpl.plugin = template.substring(0, ix);
        String metric = template.substring(ix+1, template.length());
        tmpl.metric = Metric.parse(metric);
        return tmpl;
    }

    /**
     * Like getParsedTemplate(), but also stamps the metric with the entity
     * id/type, category, and collection interval.  Returns null when the
     * measurement has no entity or the DSN could not be parsed.
     */
    private ParsedTemplate toParsedTemplate(ScheduledMeasurement meas) {
        AppdefEntityID aid = meas.getEntity();
        if (aid == null) {
            return null;
        }
        int id = aid.getID();
        int type = aid.getType();
        ParsedTemplate tmpl = getParsedTemplate(meas);
        if ((tmpl == null) || (tmpl.metric == null)) {
            return null;
        }
        tmpl.metric.setId(type, id);
        tmpl.metric.setCategory(meas.getCategory());
        tmpl.metric.setInterval(meas.getInterval());
        return tmpl;
    }

    // A single metric collection, executed on the owning plugin's executor.
    private class MetricTask implements Runnable {
        ResourceSchedule rs;
        ScheduledMeasurement meas;
        // Wall-clock bounds of the task's execution; 0 until reached.
        long executeStartTime = 0;
        long executeEndTime = 0;

        MetricTask(ResourceSchedule rs, ScheduledMeasurement meas) {
            this.rs = rs;
            this.meas = meas;
        }

        /**
         * @return The string representing this metric.
         */
        @Override
        public String toString() {
            return getParsedTemplate(meas).metric.toDebugString();
        }

        /**
         * @return 0 if task is still queued for execution, otherwise the
         * amount of time in milliseconds the task has been running.
         */
*/ public long getExecutionDuration() { if (executeStartTime == 0) { // Still queued for execution return executeStartTime; } else if (executeEndTime == 0) { // Currently executing return System.currentTimeMillis() - executeStartTime; } else { // Completed return executeEndTime - executeStartTime; } } public void run() { boolean isDebug = log.isDebugEnabled(); AppdefEntityID aid = meas.getEntity(); String category = meas.getCategory(); ParsedTemplate dsn = toParsedTemplate(meas); MetricValue data = null; executeStartTime = System.currentTimeMillis(); boolean success = false; if (rs.lastUnreachble != 0) { if (!category.equals(MeasurementConstants.CAT_AVAILABILITY)) { // Prevent stacktrace bombs if a resource is // down, but don't skip processing availability metrics. statsCollector.addStat(1, SCHEDULE_THREAD_METRIC_COLLECT_FAILED); statNumMetricsFailed++; return; } } // XXX -- We should do something with the exceptions here. // Maybe send some kind of error back to the // bizapp? try { int mid = meas.getDsnID(); if (rs.collected.get(mid) == Boolean.TRUE) { if (isDebug) { log.debug("Skipping duplicate mid=" + mid + ", aid=" + rs.id); } return; } long lastCollected = meas.getLastCollected(); long now = now(); long nowRounded = TimingVoodoo.roundDownTime(now, 60000); // HHQ-5483 - agent is collecting metrics too frequently. 
I don't want to mess with the scheduling // algorithm at this time, so I am simply putting checks in to prevent this scenario from occurring if ((lastCollected + meas.getInterval()) > nowRounded) { if (isDebug) { log.debug("ALREADY COLLECTED meas=" + meas + " @ " + TimeUtil.toString(lastCollected)); } return; } if (isDebug) { log.debug("collecting data for meas=" + meas); } data = manager.getValue(dsn.plugin, dsn.metric); if (deductServerTimeDiff) { if (Math.abs(ServerTimeDiff.getInstance().getServerTimeDiff()) > ServerTimeDiff.MIN_OFFSET_FOR_DEDUCTION) { // deduct the server time offset from the metric // value time stamp data.setTimestamp(data.getTimestamp() + ServerTimeDiff.getInstance().getServerTimeDiff()); } } long time = TimingVoodoo.roundDownTime(now, meas.getInterval()); meas.setLastCollected(time); if (data == null) { // Don't allow plugins to return null from getValue(), // convert these to MetricValue.NONE log.warn("Plugin returned null value for metric: " + dsn); data = MetricValue.NONE; } rs.collected.put(mid, Boolean.TRUE); setDiagInfo(data, dsn, rs, mid); success = true; clearLogCache(dsn); } catch(PluginNotFoundException exc){ logCache("Plugin not found", dsn, exc); } catch(PluginException exc){ logCache("Measurement plugin error", dsn, exc); } catch(MetricInvalidException exc){ logCache("Invalid Metric requested", dsn, exc); } catch(MetricNotFoundException exc){ logCache("Metric Value not found", dsn, exc); } catch(MetricUnreachableException exc){ logCache("Metric unreachable", dsn, exc); rs.lastUnreachble = executeStartTime; log.warn("Disabling metrics for: " + rs.id); } catch(Exception exc){ // Unexpected exception logCache("Error getting measurement value", dsn, exc.toString(), exc, true); } // Stats stuff Long timeDiff = System.currentTimeMillis() - executeStartTime; statsCollector.addStat(timeDiff, SCHEDULE_THREAD_METRICS_COLLECTED_TIME); synchronized (statsLock) { statTotFetchTime += timeDiff; if(timeDiff > statMaxFetchTime) { 
statMaxFetchTime = timeDiff; } if(timeDiff < statMinFetchTime) { statMinFetchTime = timeDiff; } } if (timeDiff > logFetchTimeout) { log.warn("Collection of metric: '" + dsn + "' took: " + timeDiff + "ms"); } if (success) { if (isDebug) { String debugDsn = getParsedTemplate(meas).metric.toDebugString(); String msg = "[" + aid + ":" + category + "] Metric='" + debugDsn + "' -> " + data; log.debug(msg + " timestamp=" + data.getTimestamp()); } if (data.isNone()) { //wouldn't be inserted into the database anyhow //but might as well skip sending this to the server. return; } else if (data.isFuture()) { //for example, first time collecting an exec: metric //adding to this list will cause the metric to be //collected next time the schedule has items to consume //rather than waiting for the metric's own interval //which could take much longer to hit //(e.g. Windows Updates on an 8 hour interval) rs.retry.add(meas); return; } sender.processData(meas.getDsnID(), data, meas.getDerivedID(), MeasurementConstants.CAT_AVAILABILITY.equals(category)); synchronized (statsLock) { statNumMetricsFetched++; } } else { synchronized (statsLock) { statNumMetricsFailed++; } } } private long now() { return System.currentTimeMillis(); } } private void setDiagInfo(MetricValue data, ParsedTemplate dsn, ResourceSchedule rs, int mid) { final AppdefEntityID aeid = rs.id; synchronized (diagInfo) { DiagInfo tmp = diagInfo.get(aeid); if (tmp == null) { tmp = new DiagInfo(rs.id); diagInfo.put(aeid, tmp); } tmp.add(new MetricValuePlusId(mid, data)); } } private void setDiagScheduled(ResourceSchedule rs, boolean incrementSchedules) { synchronized(diagInfo) { DiagInfo tmp = diagInfo.get(rs.id); if (tmp == null) { tmp = new DiagInfo(rs.id); diagInfo.put(rs.id, tmp); } if (incrementSchedules) { tmp.incrementSchedules(); } else { tmp.incrementUnSchedules(); } } } private int getQueueSize(String plugin) { String prop = PROP_QUEUE_SIZE + plugin; String sQueueSize = agentConfig.getProperty(prop); if(sQueueSize 
!= null){ try { return Integer.parseInt(sQueueSize); } catch(NumberFormatException exc){ log.error("Invalid setting for " + prop + " value=" + sQueueSize + " using defaults."); } } return EXECUTOR_QUEUE_SIZE; } private int getPoolSize(String plugin) { String prop = PROP_POOLSIZE + plugin; String sPoolSize = agentConfig.getProperty(prop); if(sPoolSize != null){ try { return Integer.parseInt(sPoolSize); } catch(NumberFormatException exc){ log.error("Invalid setting for " + prop + " value=" + sPoolSize + " using defaults."); } } return 1; } private void collect(ResourceSchedule rs, List<ScheduledMeasurement> items) { final boolean debug = log.isDebugEnabled(); for (int i=0; (i<items.size()) && (!shouldDie.get()); i++) { ScheduledMeasurement meas = items.get(i); ParsedTemplate tmpl = toParsedTemplate(meas); if (tmpl == null) { log.warn("template for meas id=" + meas.getDerivedID() + " is null"); continue; } ThreadPoolExecutor executor; String plugin; synchronized (executors) { try { GenericPlugin p = manager.getPlugin(tmpl.plugin).getProductPlugin(); plugin = p.getName(); } catch (PluginNotFoundException e) { if (debug) { log.debug("Could not find plugin name from template '" + tmpl.plugin + "'. 
Associated plugin might not be initialized yet."); } continue; } executor = executors.get(plugin); if (executor == null) { final int poolSize = getPoolSize(plugin); final int queueSize = getQueueSize(plugin); log.info("Creating executor for plugin '" + plugin + "' with a poolsize=" + poolSize + " queuesize=" + queueSize); final ThreadFactory factory = getFactory(plugin); executor = new ThreadPoolExecutor(poolSize, poolSize, 60, TimeUnit.SECONDS, new LinkedBlockingQueue<Runnable>(queueSize), factory, new ThreadPoolExecutor.AbortPolicy()); executors.put(plugin, executor); } } MetricTask metricTask = new MetricTask(rs, meas); statsCollector.addStat(1, SCHEDULE_THREAD_METRIC_TASKS_SUBMITTED); try { Future<?> task = executor.submit(metricTask); synchronized (metricCollections) { metricCollections.put(task,metricTask); } } catch (RejectedExecutionException e) { log.warn("Executor[" + plugin + "] rejected metric task " + metricTask); statNumMetricsFailed++; } } } private ThreadFactory getFactory(final String plugin) { final SecurityManager s = System.getSecurityManager(); final ThreadGroup group = (s != null)? 
s.getThreadGroup() : Thread.currentThread().getThreadGroup(); return new ThreadFactory() { private final AtomicLong num = new AtomicLong(); public Thread newThread(Runnable r) { final Thread rtn = new Thread(group, r); rtn.setDaemon(true); rtn.setName(plugin + "-" + num.getAndIncrement()); return rtn; } }; } private long collect(ResourceSchedule rs) { long timeOfNext; long now = System.currentTimeMillis(); Schedule schedule = rs.schedule; try { timeOfNext = schedule.getTimeOfNext(); } catch (EmptyScheduleException e) { return POLL_PERIOD + now; } if (rs.lastUnreachble != 0) { if ((now - rs.lastUnreachble) > UNREACHABLE_EXPIRE) { rs.lastUnreachble = 0; log.info("Re-enabling metrics for: " + rs.id); } } rs.collected.clear(); if (rs.retry.size() != 0) { if (log.isDebugEnabled()) { log.debug("Retrying " + rs.retry.size() + " items (MetricValue.FUTUREs)"); } collect(rs, rs.retry); rs.retry.clear(); } if (now < timeOfNext) { return timeOfNext; } List items; try { items = schedule.consumeNextItems(); timeOfNext = schedule.getTimeOfNext(); } catch (EmptyScheduleException e) { return POLL_PERIOD + now; } collect(rs, items); return timeOfNext; } private long collect() { long timeOfNext = 0; Map<String,ResourceSchedule> schedules = null; synchronized (this.schedules) { if (this.schedules.size() == 0) { //nothing scheduled timeOfNext = POLL_PERIOD + System.currentTimeMillis(); } else { schedules = new HashMap<String,ResourceSchedule>(this.schedules); } } if (schedules != null) { for (Iterator<ResourceSchedule> it = schedules.values().iterator(); it.hasNext() && (!shouldDie.get());) { ResourceSchedule rs = it.next(); try { long next = collect(rs); if (timeOfNext == 0) { timeOfNext = next; } else { timeOfNext = Math.min(next, timeOfNext); } } catch (Throwable e) { log.error(e.getMessage(), e); } } } return timeOfNext; } /** * The main loop of the ScheduleThread, which watches the schedule * waits the appropriate time, and executes scheduled operations. 
     */
    public void run(){
        final boolean isDebug = log.isDebugEnabled();
        final int fudgeFactor = getFudgeFactor();
        while (!shouldDie.get()) {
            long timeOfNext = collect();
            if (fudgeFactor > 0) {
                // Scale-testing jitter only; see getFudgeFactor().
                timeOfNext += rand.nextInt(fudgeFactor);
            }
            long now = System.currentTimeMillis();
            if (timeOfNext > now) {
                long wait = timeOfNext - now;
                if (isDebug) {
                    log.debug("Waiting " + wait + " ms until " +
                              TimeUtil.toString(now+wait));
                }
                try {
                    synchronized (interrupter) {
                        // Sleep until the next due time, or until interruptMe()
                        // notifies us (e.g. on shutdown).
                        interrupter.wait(wait);
                    }
                } catch (InterruptedException e) {
                    log.debug("Schedule thread kicked");
                }
            }
        }
        log.info("Schedule thread shut down");
    }

    /**
     * HQ-3904 Fudge factor is only for scale environments. DO NOT USE IN PRODUCTION!
     */
    private int getFudgeFactor() {
        final String val = agentConfig.getProperty("agent.dsl.fudge", "0");
        try {
            return Integer.parseInt(val);
        } catch (NumberFormatException e) {
            log.debug("val=" + val + " is not a valid number. Fudge factor will be 0 seconds");
            return 0;
        }
    }

    // MONITOR METHODS

    /**
     * @return Get the number of metrics in the schedule
     */
    public double getNumMetricsScheduled() throws AgentMonitorException {
        synchronized (statsLock) {
            return statNumMetricsScheduled;
        }
    }

    /**
     * @return The number of metrics which were attempted to be fetched (failed or successful)
     */
    public double getNumMetricsFetched() throws AgentMonitorException {
        synchronized (statsLock) {
            return statNumMetricsFetched;
        }
    }

    /**
     * @return Get the number of metrics which resulted in an error when collected
     */
    public double getNumMetricsFailed() throws AgentMonitorException {
        synchronized (statsLock) {
            return statNumMetricsFailed;
        }
    }

    /**
     * @return The total time spent fetching metrics
     */
    public double getTotFetchTime() throws AgentMonitorException {
        synchronized (statsLock) {
            return statTotFetchTime;
        }
    }

    /**
     * @return The maximum time spent fetching a metric
     */
    public double getMaxFetchTime() throws AgentMonitorException {
        synchronized (statsLock) {
            if(statMaxFetchTime == Long.MIN_VALUE) {
                // Nothing collected yet.
                return MetricValue.VALUE_NONE;
            }
            return statMaxFetchTime;
        }
    }

    /**
     * @return The minimum time spent fetching a metric
     */
    public double getMinFetchTime() throws AgentMonitorException {
        synchronized (statsLock) {
            if(statMinFetchTime == Long.MAX_VALUE) {
                // Nothing collected yet.
                return MetricValue.VALUE_NONE;
            }
            return statMinFetchTime;
        }
    }

    // Render and reset the per-resource diagnostics (AgentDiagnosticObject).
    public String getDiagStatus() {
        StringBuilder rtn = new StringBuilder();
        synchronized (diagInfo) {
            for (Entry<AppdefEntityID, DiagInfo> entry : diagInfo.entrySet()) {
                AppdefEntityID aeid = entry.getKey();
                DiagInfo d = entry.getValue();
                rtn.append(aeid).append(":").append(d).append("\n");
                d.clear();
            }
        }
        return rtn.toString();
    }

    // NOTE(review): SimpleDateFormat is not thread-safe; this instance appears
    // to be used only while holding the diagInfo lock (getDiagStatus ->
    // DiagInfo.toString) -- confirm before adding other callers.
    private static final SimpleDateFormat diagInfoTimeFormat = new SimpleDateFormat("HH:mm");

    // Per-resource diagnostic sample buffer plus schedule/unschedule counters.
    private class DiagInfo {
        @SuppressWarnings("unused")
        private final AppdefEntityID aeid;
        private final List<MetricValuePlusId> collected = new ArrayList<MetricValuePlusId>();
        private int numSchedules = 0;
        private int numUnSchedules = 0;

        private DiagInfo(AppdefEntityID aeid) {
            this.aeid = aeid;
        }

        private void add(MetricValuePlusId data) {
            collected.add(data);
        }

        private void clear() {
            collected.clear();
            numUnSchedules = 0;
            numSchedules = 0;
        }

        private void incrementUnSchedules() {
            numUnSchedules++;
        }

        private void incrementSchedules() {
            numSchedules++;
        }

        @Override
        public String toString() {
            final StringBuilder rtn = new StringBuilder();
            rtn.append("(").append(numSchedules).append("-").append(numUnSchedules).append(")");
            for (MetricValuePlusId m : collected) {
                rtn.append("(").append(m.mid).append("-")
                   .append(diagInfoTimeFormat.format(new Date(m.getTimestamp())))
                   .append("=").append(m.getValue()).append(")");
            }
            return rtn.toString();
        }
    }

    // Pairs a measurement id with its collected value for diagnostics output.
    private class MetricValuePlusId {
        private final int mid;
        private final MetricValue value;

        private MetricValuePlusId(int mid, MetricValue value) {
            this.mid = mid;
            this.value = value;
        }

        public double getValue() {
            return value.getValue();
        }

        public long getTimestamp() {
            return value.getTimestamp();
        }
    }

    public String getDiagName() {
        return "Schedule Thread Diagnostics";
    }
}