/**
* Helios, OpenSource Monitoring
* Brought to you by the Helios Development Group
*
* Copyright 2007, Helios Development Group and individual contributors
* as indicated by the @author tags. See the copyright.txt file in the
* distribution for a full listing of individual contributors.
*
* This is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this software; if not, write to the Free
* Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
* 02110-1301 USA, or see the FSF site: http://www.fsf.org.
*
*/
package org.helios.apmrouter.destination.chronicletimeseries;
import java.lang.Thread.UncaughtExceptionHandler;
import java.lang.management.ManagementFactory;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.CopyOnWriteArraySet;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
import org.apache.log4j.Logger;
import org.helios.apmrouter.catalog.EntryStatus;
import org.helios.apmrouter.catalog.EntryStatus.EntryStatusChange;
import org.helios.apmrouter.catalog.EntryStatusChangeListener;
import org.helios.apmrouter.catalog.jdbc.h2.adapters.chronicle.ChronicleTSAdapter;
import org.helios.apmrouter.collections.ConcurrentLongSlidingWindow;
import org.helios.apmrouter.server.ServerComponentBean;
import org.helios.apmrouter.tsmodel.Tier;
import org.helios.apmrouter.tsmodel.TimeSeriesModel;
import org.helios.apmrouter.util.SystemClock;
import org.helios.apmrouter.util.SystemClock.ElapsedTime;
import org.springframework.jmx.export.annotation.ManagedAttribute;
import org.springframework.jmx.export.annotation.ManagedMetric;
import org.springframework.jmx.export.annotation.ManagedOperation;
import org.springframework.jmx.support.MetricType;
/**
* <p>Title: ChronicleTSManager</p>
* <p>Description: Configures and manages the chronicle time-series</p>
* <p>Company: Helios Development Group LLC</p>
* @author Whitehead (nwhitehead AT heliosdev DOT org)
* <p><code>org.helios.apmrouter.destination.chronicletimeseries.ChronicleTSManager</code></p>
* TODO:
* ==============================================
* Add status check scheduler
* Add worker pool
* Execute status checks across multiple threads and wait for completion
* Invoke fireEvents asynchronously
* Implement status changes in metric catalog
* Add metrics to track elapsed time and number of state changes during status checks
* Check for first/last periods in entries (which do we want ?) Oldest should be first.....
* Status check optimization: oldest period in a tier should be in the tier header
* ==============================================
* TODO: Add support for rolling up into next tiers.
* TODO: Add basic query functionality
* TODO: Fill-Ins for sticky metrics ? Physical or implied
*/
public class ChronicleTSManager extends ServerComponentBean implements UncaughtExceptionHandler, Runnable {
/** The time series model, created from the configuration string passed to the constructor */
private final TimeSeriesModel timeSeriesModel;
/** A map of the time-series chronicle-tiers keyed by the tier name, populated in the constructor */
private final Map<String, ChronicleTier> tiers;
/** The live tier, which is the tier registered under the name <code>live</code> */
private final ChronicleTier liveTier;
/** The number of periods in the live tier that marks a metric stale */
protected int stalePeriods = 4;
/** The stale window length in ms. Remains <code>-1</code> until it is recalculated from {@link #stalePeriods} */
protected long staleWindowSize = -1L;
/** The number of periods in the live tier that marks a metric offline */
protected int offLinePeriods = 20;
/** The offLine window length in ms. Remains <code>-1</code> until it is recalculated from {@link #offLinePeriods} */
protected long offLineWindowSize = -1L;
/** Status check timeout in ms., which is how long a status check waits for its workers to complete */
protected long statusCheckTimeout = 5000;
/** Flag indicating if a status check is running. Reset to false when a status check ends. */
protected final AtomicBoolean statusCheckRunning = new AtomicBoolean(false);
/** The manager's worker thread pool. Null until supplied through {@link #setExecutorService(ExecutorService)} */
protected ExecutorService threadPool = null;
/** The manager's period scheduler, backed by a single daemon thread running at maximum priority */
protected ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(1, new ThreadFactory(){
// Serial number appended to the scheduler thread names
final AtomicInteger serial = new AtomicInteger(0);
@Override
public Thread newThread(Runnable r) {
Thread t = new Thread(r, "TimeSeriesScheduler#" + serial.incrementAndGet());
t.setPriority(Thread.MAX_PRIORITY);
t.setDaemon(true);
// Log anything that escapes a scheduled task rather than losing it
t.setUncaughtExceptionHandler(new UncaughtExceptionHandler() {
@Override
public void uncaughtException(Thread t, Throwable e) {
error("Uncaught exception in TimeSeries scheduler [", t, "]", e);
}
});
return t;
}
});
/** The number of processing threads to create, which is the number of processors available to the JVM */
protected final int workerThreadCount = ManagementFactory.getOperatingSystemMXBean().getAvailableProcessors();
/** A shared latch reference, replaced for each status check and counted down by each worker as it completes */
protected final AtomicReference<CountDownLatch> latch = new AtomicReference<CountDownLatch>(null);
/** A shared EntryStatusChange reference, replaced for each status check and used by the workers to record status changes */
protected final AtomicReference<Map<EntryStatus, EntryStatusChange>> changeCollector = new AtomicReference<Map<EntryStatus, EntryStatusChange>>(null);
/** The worker tasks to execute, one per worker thread, created in the constructor */
protected final StatusCheckWorker[] workers;
/** A set of {@link EntryStatus} change listeners to be notified when an entry changes state */
protected final Set<EntryStatusChangeListener> statusListeners = new CopyOnWriteArraySet<EntryStatusChangeListener>();
/** Long sliding window (size 30) of the elapsed times in ns. for status checks */
protected final ConcurrentLongSlidingWindow statusCheckElapsedNs = new ConcurrentLongSlidingWindow(30);
/** Long sliding window (size 30) of the number of entries checked in the last status checks */
protected final ConcurrentLongSlidingWindow totalEntriesChecked = new ConcurrentLongSlidingWindow(30);
/** Long sliding window (size 30) of the number of entries set to stale in the last status checks */
protected final ConcurrentLongSlidingWindow totalStaleEntries = new ConcurrentLongSlidingWindow(30);
/** Long sliding window (size 30) of the number of entries set to offline in the last status checks */
protected final ConcurrentLongSlidingWindow totalOffLineEntries = new ConcurrentLongSlidingWindow(30);
/** Long sliding window (size 30) of the number of exceptions in the last status checks */
protected final ConcurrentLongSlidingWindow statusExceptions = new ConcurrentLongSlidingWindow(30);
// default ts model = p=15s,t=5m
/**
 * Supplies the worker thread pool that the manager uses to run status checks
 * and to dispatch status change events.
 * @param threadPool the executor service to use as the manager's worker pool
 */
public void setExecutorService(final ExecutorService threadPool) {
    // Plain assignment: the pool is only consumed once the component starts
    this.threadPool = threadPool;
}
/**
 * Logs an exception that escaped from a time series worker thread.
 * {@inheritDoc}
 * @see java.lang.Thread.UncaughtExceptionHandler#uncaughtException(java.lang.Thread, java.lang.Throwable)
 */
@Override
public void uncaughtException(final Thread t, final Throwable e) {
    // The throwable is passed last so the logger can render the stack trace
    error("Uncaught exception in TimeSeries worker [", t, "]", e);
}
/**
 * Returns how many live tier periods without activity mark a metric as stale.
 * @return the configured stale period count
 */
@ManagedAttribute(description="The number of periods in the live tier that marks a metric stale")
public int getStalePeriods() {
    return this.stalePeriods;
}
/**
 * Updates how many live tier periods without activity mark a metric as stale.
 * If the component is already started, the stale window size is recalculated immediately.
 * @param stalePeriods the number of inactive live tier periods after which a metric is stale
 */
@ManagedAttribute(description="The number of periods in the live tier that marks a metric stale")
public void setStalePeriods(int stalePeriods) {
    this.stalePeriods = stalePeriods;
    // Before startup the window is computed by doStart(), so nothing more to do
    if(!isStarted()) {
        return;
    }
    recalcStaleWindowSize();
}
/**
 * Applies a status update for one metric to the live tier.
 * This is invoked by the h2 metric table trigger; because the change
 * originates from the metric table, no status change event is fired here.
 * @param metricId The id of the metric whose status is being updated
 * @param status The status to apply to the metric's live tier entry
 */
public void triggeredStatusUpdate(final long metricId, final EntryStatus status) {
    // Pure delegation to the live tier
    this.liveTier.triggeredStatusUpdate(metricId, status);
}
/**
 * Returns how many live tier periods without activity mark a metric as offline.
 * @return the configured offline period count
 */
@ManagedAttribute(description="The number of periods in the live tier that marks a metric offline")
public int getOffLinePeriods() {
    return this.offLinePeriods;
}
/**
 * Sets the number of periods in the live tier that marks a metric offline.
 * If the component is already started, the offline window size is recalculated
 * immediately so the new period count takes effect on the next status check.
 * @param offLinePeriods the number periods in the live tier without activity to mark a metric offline
 */
@ManagedAttribute(description="The number of periods in the live tier that marks a metric offline")
public void setOffLinePeriods(int offLinePeriods) {
    this.offLinePeriods = offLinePeriods;
    if(this.isStarted()) {
        // The offline window (not the stale window) depends on offLinePeriods
        recalcOffLineWindowSize();
    }
}
/**
 * Returns the length of the stale window in ms., as last calculated
 * from the live tier period duration and the stale period count.
 * @return the stale window length in ms.
 */
@ManagedAttribute(description="The calculated elapsed time of the stale window in ms.")
public long getStaleWindowSize() {
    return this.staleWindowSize;
}
/**
 * Returns the length of the offline window in ms., as last calculated
 * from the live tier period duration and the offline period count.
 * @return the offline window length in ms.
 */
@ManagedAttribute(description="The calculated elapsed time of the offline window in ms.")
public long getOffLineWindowSize() {
    return this.offLineWindowSize;
}
/**
 * Returns the definition string of the time series model in use.
 * @return the model definition reported by the time series model
 */
@ManagedAttribute(description="The model definition the time series model was built with")
public String getTimeSeriesModel() {
    final String modelDefinition = timeSeriesModel.getModelDef();
    return modelDefinition;
}
/**
 * Starts the manager:<ol>
 * <li>Verifies that a worker thread pool has been supplied.</li>
 * <li>Calculates the stale and offline window sizes.</li>
 * <li>Registers a JVM shutdown hook that closes every chronicle tier.</li>
 * <li>Registers this manager with the {@link ChronicleTSAdapter}.</li>
 * <li>Schedules the recurring live tier status check, one live tier period apart.</li>
 * <li>Submits an immediate initial status check to the worker pool.</li>
 * </ol>
 * {@inheritDoc}
 * @throws IllegalStateException thrown if no executor service has been set
 * @see org.helios.apmrouter.server.ServerComponentBean#doStart()
 */
@Override
protected void doStart() throws Exception {
    // Fail fast: without a pool the start would otherwise fail half way through,
    // after the shutdown hook and the schedule have already been installed.
    if(threadPool==null) {
        throw new IllegalStateException("No ExecutorService has been set on the ChronicleTSManager");
    }
    recalcStaleWindowSize();
    recalcOffLineWindowSize();
    Thread t = new Thread() {
        @Override
        public void run() {
            info("\n\t========================\n\tCLOSING CTS\n\t========================\n");
            for(ChronicleTier ct : tiers.values()) {
                info("Closing [", ct.chronicleName, "]");
                ct.close();
            }
            info("\n\t========================\n\tCLOSED CTS\n\t========================\n");
        }
    };
    t.setDaemon(false);
    t.setPriority(Thread.MAX_PRIORITY);
    Runtime.getRuntime().addShutdownHook(t);
    ChronicleTSAdapter.setCts(this);
    scheduler.scheduleWithFixedDelay(new Runnable(){
        @Override
        public void run() {
            if(!statusCheckRunning.compareAndSet(false, true)) {
                warn("StatusCheck already running when scheduled fired");
                return;
            }
            try {
                runStatusCheck();
            } catch (Exception ex) {
                // An exception escaping a scheduled task suppresses all subsequent
                // executions, so it is logged here to keep the schedule alive.
                error("Scheduled status check failed", ex);
            }
        }
    }, liveTier.getPeriodDuration(), liveTier.getPeriodDuration(), TimeUnit.SECONDS);
    threadPool.execute(new Runnable() {
        @Override
        public void run() {
            // Use the same guard as the scheduled task so that two checks can
            // never share the latch, change map and worker counters.
            if(statusCheckRunning.compareAndSet(false, true)) {
                runStatusCheck();
            }
        }
    });
}
/**
 * Requests an entry status check on the live tier. The check is submitted to the
 * worker thread pool and runs asynchronously. If a status check is already in
 * progress when the submitted task runs, the request is skipped with a warning,
 * because concurrent checks would share the same latch, change map and workers.
 */
@ManagedOperation(description="Runs an entry status check on the live tier")
public void statusCheck() {
    threadPool.execute(new Runnable() {
        @Override
        public void run() {
            // Claim the running flag atomically; runStatusCheck() releases it when done
            if(!statusCheckRunning.compareAndSet(false, true)) {
                warn("StatusCheck already running when manual status check was requested");
                return;
            }
            runStatusCheck();
        }
    });
}
/**
 * Returns the duration of one period of the live tier, in seconds.
 * @return the period duration reported by the live tier
 */
@ManagedAttribute(description="The live tier period duration in seconds")
public long getLiveTierPeriodDuration() {
    final long periodDuration = liveTier.getPeriodDuration();
    return periodDuration;
}
/**
 * Satisfies the {@link Runnable} contract declared by this class. Performs no work.
 * {@inheritDoc}
 * @see java.lang.Runnable#run()
 */
@Override
public void run() {
    // Intentionally empty
}
/**
 * Scans the live tier for stale and off line metrics.
 * <p>A fresh latch and change map are published through the shared references,
 * every worker is submitted to the thread pool, and the calling thread then waits
 * up to {@link #statusCheckTimeout} ms. for all workers to count down the latch.
 * On completion the per-check totals are recorded in the sliding windows and, if any
 * entry went stale or offline, a status change event is fired. The
 * {@link #statusCheckRunning} flag is always reset to false on exit.</p>
 */
protected void runStatusCheck() {
    debug("TimeSeries Live Tier Status Check Started");
    try {
        final CountDownLatch cdl = new CountDownLatch(workerThreadCount);
        final Map<EntryStatus, EntryStatusChange> changeMap = EntryStatusChange.getChangeMap(SystemClock.time());
        // Publish the shared state before any worker is dispatched
        changeCollector.set(changeMap);
        latch.set(cdl);
        for(StatusCheckWorker worker: workers) {
            threadPool.execute(worker);
        }
        SystemClock.startTimer();
        try {
            if(cdl.await(statusCheckTimeout, TimeUnit.MILLISECONDS)) {
                long totalUpdates = 0;
                long totalStales = changeMap.get(EntryStatus.STALE).getMetricIds().size();
                long totalOffLines = changeMap.get(EntryStatus.OFFLINE).getMetricIds().size();
                long totalExceptions = 0;
                for(StatusCheckWorker worker: workers) {
                    totalUpdates += worker.totalUpdates;
                    // Invalid indexes and other failures are reported as one exception total
                    totalExceptions += worker.totalInvalidIndexes;
                    totalExceptions += worker.totalExceptions;
                }
                totalEntriesChecked.insert(totalUpdates);
                totalStaleEntries.insert(totalStales);
                totalOffLineEntries.insert(totalOffLines);
                statusExceptions.insert(totalExceptions);
                if((totalStales + totalOffLines)>0) {
                    fireEventStatusChangeEvent(changeMap);
                }
                ElapsedTime et = SystemClock.endTimer();
                statusCheckElapsedNs.insert(et.elapsedNs);
                debug("Status check complete in ", et);
            } else {
                error("Scheduler thread timed out after [", statusCheckTimeout, "] ms waiting for status check");
            }
        } catch (InterruptedException iex) {
            // Restore the interrupt status so the owner of this thread can react to it
            Thread.currentThread().interrupt();
            error("Scheduler thread interrupted while waiting for status check", iex);
        } catch (Exception ex) {
            // Anything else is a failure while collecting results, not an interruption
            error("Status check failed while collecting results", ex);
        }
    } finally {
        statusCheckRunning.set(false);
    }
}
/**
 * A status check task that checks a slice of a tier's entries. The entries are
 * partitioned across the workers by index: a worker handles every index for which
 * <code>index % workers == indexMod</code>. The counters are reset at the start of
 * each run and are read by the manager once the run's latch has been counted down.
 */
private class StatusCheckWorker implements Runnable {
    /** The tier whose entries are checked */
    protected final ChronicleTier tier;
    /** The index remainder that this worker is responsible for */
    protected final int indexMod;
    /** The total number of workers the tier is partitioned across */
    protected final int workers;
    /** This worker's logger, named after the worker class and its index mod */
    protected final Logger log;
    /** The number of entries successfully checked in the current/last run */
    protected long totalUpdates = 0;
    /** The number of invalid index failures in the current/last run */
    protected long totalInvalidIndexes = 0;
    /** The number of other failures in the current/last run */
    protected long totalExceptions = 0;
    /**
     * Creates a new StatusCheckWorker
     * @param tier The tier that will be checked
     * @param indexMod The index mod that this worker handles
     * @param workers The total number of workers
     */
    public StatusCheckWorker(ChronicleTier tier, int indexMod, int workers) {
        super();
        log = Logger.getLogger(getClass().getName() + ".#" + indexMod );
        this.tier = tier;
        this.indexMod = indexMod;
        this.workers = workers;
    }
    /**
     * Checks the status of every entry in this worker's slice of the tier, records
     * entries that changed status in the shared change map, and counts down the
     * latch of the status check this run belongs to.
     * {@inheritDoc}
     * @see java.lang.Runnable#run()
     */
    @Override
    public void run() {
        totalUpdates = 0;
        totalInvalidIndexes = 0;
        totalExceptions = 0;
        // Capture both shared references up front. A run that outlives its status
        // check's timeout must count down its own latch, not the latch that a
        // subsequent status check has published in the meantime.
        final CountDownLatch myLatch = latch.get();
        final Map<EntryStatus, EntryStatusChange> changeMap = changeCollector.get();
        try {
            final long now = SystemClock.time();
            // Stride through this worker's indexes: indexMod, indexMod + workers, ...
            for(long index = indexMod; index < tier.getSize(); index += workers) {
                try {
                    EntryStatus status = tier.statusCheck(index, now, staleWindowSize, offLineWindowSize);
                    totalUpdates++;
                    // A non-null status is used as the key of the change to record
                    if(status!=null) {
                        changeMap.get(status).addMetricIds(index);
                    }
                } catch (InvalidIndexExcetpion iie) {
                    totalInvalidIndexes++;
                } catch (Exception ex) {
                    // Best effort: one bad entry must not stop the scan, so it is only counted
                    totalExceptions++;
                }
            }
        } finally {
            myLatch.countDown();
        }
    }
}
/**
 * Creates a new ChronicleTSManager. Builds the time series model from the passed
 * configuration, creates one {@link ChronicleTier} per model tier (walking the model's
 * tier pairs from last to first so that a tier's follow-on tier already exists when the
 * tier is created), locates the live tier and creates the status check workers.
 * @param timeSeriesConfig The string representation of the time series configuration
 * @throws IllegalStateException thrown if the model does not define a tier named <code>live</code>
 */
public ChronicleTSManager(String timeSeriesConfig) {
    timeSeriesModel = TimeSeriesModel.create(timeSeriesConfig);
    tiers = new HashMap<String, ChronicleTier>(timeSeriesModel.getTierCount());
    final List<Tier[]> tierPairs = timeSeriesModel.getModelTierPairs();
    int pairIndex = tierPairs.size();
    while(--pairIndex >= 0) {
        final Tier[] pair = tierPairs.get(pairIndex);
        // pair[0] is the tier being created, pair[1] (possibly null) is the tier that follows it
        ChronicleTier followOnTier = null;
        if(pair[1]!=null) {
            followOnTier = getTier(pair[1].getName());
        }
        final ChronicleTier created = new ChronicleTier(pair[0], followOnTier, this);
        tiers.put(pair[0].getName(), created);
    }
    liveTier = tiers.get("live");
    if(liveTier==null) {
        throw new IllegalStateException("There was no live tier", new Throwable());
    }
    workers = new StatusCheckWorker[workerThreadCount];
    for(int slot = 0; slot < workers.length; slot++) {
        workers[slot] = new StatusCheckWorker(liveTier, slot, workerThreadCount);
    }
}
/**
 * Recalculates and stores the stale window size in ms., which is the live tier
 * period duration (in seconds) multiplied by the stale period count.
 */
protected void recalcStaleWindowSize() {
    final long staleWindowSeconds = liveTier.getPeriodDuration() * stalePeriods;
    staleWindowSize = TimeUnit.SECONDS.toMillis(staleWindowSeconds);
}
/**
 * Recalculates and stores the offline window size in ms., which is the live tier
 * period duration (in seconds) multiplied by the offline period count.
 */
protected void recalcOffLineWindowSize() {
    final long offLineWindowSeconds = liveTier.getPeriodDuration() * offLinePeriods;
    offLineWindowSize = TimeUnit.SECONDS.toMillis(offLineWindowSeconds);
}
/**
 * Fires a status change event to all registered listeners. The notification is
 * submitted to the worker thread pool, so this method returns before the listeners
 * have been called. A listener that throws is logged and skipped so that the
 * remaining listeners are still notified.
 * @param changeMap the change map with all the status changes
 */
protected void fireEventStatusChangeEvent(final Map<EntryStatus, EntryStatusChange> changeMap) {
    threadPool.execute(new Runnable() {
        @Override
        public void run() {
            for(EntryStatusChangeListener listener: statusListeners) {
                try {
                    listener.onEntryStatusChange(changeMap);
                } catch (Exception ex) {
                    // Isolate listener failures from each other
                    error("EntryStatusChangeListener [", listener, "] failed to process status change", ex);
                }
            }
        }
    });
}
/**
 * Provides access to the live chronicle tier.
 * @return the live tier
 */
public ChronicleTier getLiveTier() {
    return this.liveTier;
}
/**
 * Looks up a chronicle tier by name. The name is trimmed before the lookup.
 * @param name The name of the tier to retrieve
 * @return the chronicle tier registered under the trimmed name
 * @throws IllegalArgumentException thrown if the name is null, empty or does not map to a ChronicleTier
 */
public ChronicleTier getTier(String name) {
    // Treat null like blank so that both are rejected by the same check
    final String key = (name==null) ? "" : name.trim();
    if(key.isEmpty()) {
        throw new IllegalArgumentException("The passed tier name was null or empty", new Throwable());
    }
    final ChronicleTier found = tiers.get(key);
    if(found!=null) {
        return found;
    }
    throw new IllegalArgumentException("The passed tier name [" + name + "] was invalid", new Throwable());
}
/**
 * Registers an {@link EntryStatusChangeListener} to be notified of entry status changes.
 * A null listener is ignored.
 * @param statusListener the listener to register
 */
public void addStatusListener(EntryStatusChangeListener statusListener) {
    if(statusListener==null) {
        return;
    }
    statusListeners.add(statusListener);
}
/**
 * Unregisters a previously registered {@link EntryStatusChangeListener}.
 * A null listener is ignored.
 * @param statusListener the listener to remove
 */
public void removeStatusListener(EntryStatusChangeListener statusListener) {
    if(statusListener==null) {
        return;
    }
    statusListeners.remove(statusListener);
}
/**
 * Returns how long, in ms., a status check waits for its workers to complete.
 * @return the status check timeout in ms.
 */
@ManagedAttribute(description="The status check timeout in ms.")
public long getStatusCheckTimeout() {
    return this.statusCheckTimeout;
}
/**
 * Updates how long, in ms., a status check waits for its workers to complete.
 * The new value is read by the next status check.
 * @param statusCheckTimeout the status check timeout in ms.
 */
@ManagedAttribute(description="The status check timeout in ms.")
public void setStatusCheckTimeout(final long statusCheckTimeout) {
    this.statusCheckTimeout = statusCheckTimeout;
}
/**
 * Returns the elapsed time in ns. of the most recent entry status check.
 * @return the most recent elapsed time in ns., or -1 if no status check has been recorded
 */
@ManagedMetric(category="ChronicleTimeSeries", displayName="LastStatusCheckTimeNs", metricType=MetricType.GAUGE, description="The elapsed time of the most recent entry status check in ns.")
public long getLastStatusCheckTimeNs() {
    if(statusCheckElapsedNs.isEmpty()) {
        return -1L;
    }
    return statusCheckElapsedNs.get(0);
}
/**
 * Returns the rolling average elapsed time of the last 30 status checks in ns.
 * The elapsed time sliding window holds 30 entries, so that is the most the
 * average can cover.
 * @return the rolling average elapsed time in ns., or -1 if no status check has been recorded
 */
@ManagedMetric(category="ChronicleTimeSeries", displayName="AverageStatusCheckTimeNs", metricType=MetricType.GAUGE, description="The rolling average elapsed time of the last 30 status checks in ns.")
public long getAverageStatusCheckTimeNs() {
    return statusCheckElapsedNs.isEmpty() ? -1L : statusCheckElapsedNs.avg();
}
/**
 * Returns the elapsed time of the most recent entry status check in ms.
 * @return the elapsed time of the most recent entry status check in ms., or -1 if no status check has been recorded
 */
@ManagedMetric(category="ChronicleTimeSeries", displayName="LastStatusCheckTimeMs", metricType=MetricType.GAUGE, description="The elapsed time of the most recent entry status check in ms.")
public long getLastStatusCheckTimeMs() {
    final long elapsedNs = getLastStatusCheckTimeNs();
    // Converting the -1 "no data" sentinel would truncate it to 0, which is
    // indistinguishable from a real sub-millisecond reading, so pass it through.
    return elapsedNs < 0 ? -1L : TimeUnit.NANOSECONDS.toMillis(elapsedNs);
}
/**
 * Returns the rolling average elapsed time of the last 30 status checks in ms.
 * The elapsed time sliding window holds 30 entries, so that is the most the
 * average can cover.
 * @return the rolling average elapsed time in ms., or -1 if no status check has been recorded
 */
@ManagedMetric(category="ChronicleTimeSeries", displayName="AverageStatusCheckTimeMs", metricType=MetricType.GAUGE, description="The rolling average elapsed time of the last 30 status checks in Ms.")
public long getAverageStatusCheckTimeMs() {
    final long averageNs = getAverageStatusCheckTimeNs();
    // Converting the -1 "no data" sentinel would truncate it to 0, which is
    // indistinguishable from a real sub-millisecond reading, so pass it through.
    return averageNs < 0 ? -1L : TimeUnit.NANOSECONDS.toMillis(averageNs);
}
/**
 * Returns how many entries were checked by the most recent status check.
 * @return the most recent checked entry count, or -1 if no status check has been recorded
 */
@ManagedMetric(category="ChronicleTimeSeries", displayName="LastEntriesChecked", metricType=MetricType.GAUGE, description="The the number of entries checked in the last status check")
public long getLastEntriesChecked() {
    if(totalEntriesChecked.isEmpty()) {
        return -1L;
    }
    return totalEntriesChecked.get(0);
}
/**
 * Returns the rolling average of the entries checked over the last 30 status checks.
 * @return the rolling average checked entry count, or -1 if no status check has been recorded
 */
@ManagedMetric(category="ChronicleTimeSeries", displayName="AverageEntriesChecked", metricType=MetricType.GAUGE, description="The the rolling average of entries checked in the last 30 status checks")
public long getAverageEntriesChecked() {
    if(totalEntriesChecked.isEmpty()) {
        return -1L;
    }
    return totalEntriesChecked.avg();
}
/**
 * Returns how many entries were set to stale by the most recent status check.
 * @return the most recent stale entry count, or -1 if no status check has been recorded
 */
@ManagedMetric(category="ChronicleTimeSeries", displayName="LastStaleEntries", metricType=MetricType.GAUGE, description="The the number of stale entries in the last status check")
public long getLastStaleEntries() {
    if(totalStaleEntries.isEmpty()) {
        return -1L;
    }
    return totalStaleEntries.get(0);
}
/**
 * Returns the rolling average of the entries set to stale over the last 30 status checks.
 * @return the rolling average stale entry count, or -1 if no status check has been recorded
 */
@ManagedMetric(category="ChronicleTimeSeries", displayName="AverageStaleEntries", metricType=MetricType.GAUGE, description="The the rolling average of stale entries in the last 30 status checks")
public long getAverageStaleEntries() {
    if(totalStaleEntries.isEmpty()) {
        return -1L;
    }
    return totalStaleEntries.avg();
}
/**
 * Returns how many entries were set to offline by the most recent status check.
 * @return the most recent offline entry count, or -1 if no status check has been recorded
 */
@ManagedMetric(category="ChronicleTimeSeries", displayName="LastOffLineEntries", metricType=MetricType.GAUGE, description="The the number of offline entries in the last status check")
public long getLastOffLineEntries() {
    if(totalOffLineEntries.isEmpty()) {
        return -1L;
    }
    return totalOffLineEntries.get(0);
}
/**
 * Returns the rolling average of the entries set to offline over the last 30 status checks.
 * @return the rolling average offline entry count, or -1 if no status check has been recorded
 */
@ManagedMetric(category="ChronicleTimeSeries", displayName="AverageOffLineEntries", metricType=MetricType.GAUGE, description="The the rolling average of offline entries in the last 30 status checks")
public long getAverageOffLineEntries() {
    if(totalOffLineEntries.isEmpty()) {
        return -1L;
    }
    return totalOffLineEntries.avg();
}
/**
 * Returns how many exceptions were counted by the most recent status check.
 * @return the most recent exception count, or -1 if no status check has been recorded
 */
@ManagedMetric(category="ChronicleTimeSeries", displayName="LastStatusExceptions", metricType=MetricType.GAUGE, description="The the number of statusExceptions in the last status check")
public long getLastStatusExceptions() {
    if(statusExceptions.isEmpty()) {
        return -1L;
    }
    return statusExceptions.get(0);
}
/**
 * Returns the rolling average of the exceptions counted over the last 30 status checks.
 * @return the rolling average exception count, or -1 if no status check has been recorded
 */
@ManagedMetric(category="ChronicleTimeSeries", displayName="StatusExceptions", metricType=MetricType.GAUGE, description="The the rolling average of statusExceptions in the last 30 status checks")
public long getAverageStatusExceptions() {
    if(statusExceptions.isEmpty()) {
        return -1L;
    }
    return statusExceptions.avg();
}
}