/* * ALMA - Atacama Large Millimiter Array * Copyright (c) European Southern Observatory, 2011 * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ package alma.acs.monitoring.blobber; import java.net.DatagramPacket; import java.net.DatagramSocket; import java.net.InetAddress; import java.net.UnknownHostException; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.concurrent.atomic.AtomicLong; import java.util.logging.Level; import java.util.logging.Logger; import alma.ACSErrTypeCommon.wrappers.AcsJCouldntCreateObjectEx; import alma.JavaContainerError.wrappers.AcsJContainerServicesEx; import alma.MonitorArchiver.CollectorListStatus; import alma.TMCDB.MonitorBlob; import alma.TMCDB.MonitorCollector; import alma.TMCDB.MonitorCollectorHelper; import alma.TMCDB.MonitorCollectorOperations; import alma.TMCDB.MonitorDataBlock; import alma.acs.concurrent.ThreadLoopRunner.CancelableRunnable; import alma.acs.container.ContainerServices; import alma.acs.logging.AcsLogLevel; import alma.acs.monitoring.MonitorPointTimeSeries; import alma.acs.monitoring.DAO.MonitorDAO; import alma.acs.monitoring.blobber.CollectorList.CollectorData; import alma.acs.util.IsoDateFormat; import alma.acs.util.StopWatch; /** * The part of the blobber component that does the real work. * It implements Runnable to allow running it repeatedly in a separate thread. */ public class BlobberWorker extends CancelableRunnable { public static final String BLOBBER_CHECK_JVM_MEMORY_PROPERTYNAME = "alma.acs.monitoring.blobber.checkmemory"; /** * DAOs store the data in the DB or on file, inside proper transaction(s) and performing auto-completion of hardware tables if necessary. */ protected List<MonitorDAO> myMonitorDAOList; // /** // * WatchDog to get blobber queues statistics. Currently not used in the ACS layers of blobber comp. // */ // protected BlobberWatchDog myWatchDog; /** * List of monitor collectors that this blobber serves. */ private final CollectorList myCollectorList = new CollectorList(); protected ContainerServices myContainerServices; protected Logger myLogger; /** * Maximum number of monitor collectors that a blobber accepts to handle. */ private volatile int myMaxCollectorCount = 6; /** * The interval between blobber cycles, each harvesting all collectors. * This is used only for logging, since the cycle gets triggered from outside of this class. */ protected long collectIntervalSec; /** * Remembers names of properties for which a failure has been logged already, * to avoid repeating this log. * <p> * We store the property name with full path, e.g. <code>CONTROL/AOSTiming/PSCR:STATUS</code>. */ private final HashSet<String> loggedFailedStore = new HashSet<String>(); /** * See "<code>archive.tmcdb.monitoring.profiling</code>" in archiveConfig.properties, * and {@link #debugDataSender}. */ private boolean isProfilingEnabled; /** * Used for special UDP debug messages, see {@link #isProfilingEnabled}. */ private DebugDataSender debugDataSender; /** * Counts how often the blobber has run in total. */ private final AtomicLong cycleCount = new AtomicLong(0); /** * Was added on ACS-9_0_0-B but then not merged to the HEAD in the old location ARCHIVE/TMCDB/. * @TODO replace with generic component ref cache */ private final HashMap<String, MonitorCollectorOperations> collectorName2ComponentReference = new HashMap<String, MonitorCollectorOperations>(); private final MonitorPointExpert monitorPointExpert; private final AnyExtractor anyExtractor; /** * @param inContainerServices used for logging and to get references to the collector components. * @param blobberPlugin * @throws AcsJCouldntCreateObjectEx If <code>blobberPlugin#createMonitorDAO</code> fails. */ public BlobberWorker(ContainerServices inContainerServices, BlobberPlugin blobberPlugin) throws AcsJCouldntCreateObjectEx { this.myContainerServices = inContainerServices; myLogger = myContainerServices.getLogger(); this.isProfilingEnabled = blobberPlugin.isProfilingEnabled(); notifyCollectorIntervalChange(blobberPlugin.getCollectorIntervalSec()); initWorker(); this.myMonitorDAOList = blobberPlugin.getMonitorDAOs(); // this.myWatchDog = blobberPlugin.getBlobberWatchDog(); monitorPointExpert = blobberPlugin.getMonitorPointExpert(); anyExtractor = new AnyExtractor(myLogger, monitorPointExpert); } protected void initWorker() { if (isProfilingEnabled) { try { debugDataSender = new DebugDataSender(myContainerServices.getName()); myLogger.fine("Blobber profiling enabled: " + debugDataSender.toString()); } catch (java.net.UnknownHostException e) { isProfilingEnabled = false; myLogger.warning("Failed to enable blobber profiling: " + DebugDataSender.targetHost + " could not be resolved."); } catch (Exception e) { isProfilingEnabled = false; myLogger.log(Level.WARNING, "Failed to enable blobber profiling.", e); } } else { myLogger.fine("Blobber profiling NOT enabled."); } } /** * @see #collectIntervalSec * @see BlobberImpl#setCollectorIntervalSeconds(long) */ void notifyCollectorIntervalChange(long inCollectIntervalSec) { this.collectIntervalSec = inCollectIntervalSec; } /** * @return The maximum number of collectors that this blobber is allowed to have. */ protected int getMaxCollectorCount() { return myMaxCollectorCount; } protected void setMaxCollectorCount(int inCount) { if (inCount <= 0) { throw new IllegalArgumentException(inCount + " must be above 0."); } myMaxCollectorCount = inCount; } /** * Adds the collector component of the given name to the blobber. If the collector is already * known to the blobber it will not be added again. * <p> * HSO TODO: what is meant by the following description? I don't see any of this in the code. <br> * This method will however * NOT wait for this, it returns immediately with one exception. When the * second collector is added it is first ensured that one access has * completed to the firstly added collector in order to wait for the access * statistics needed to calculate the capacity of the blobber. * @param inCollectorComponentId a String stating the name of the collector to add. * @return a CollectorListStatus stating the result. Values to expect are * {@link CollectorListStatus#ADDED}, * {@link CollectorListStatus#KNOWN} or * {@link CollectorListStatus#FULL}. */ public CollectorListStatus addCollector(String inCollectorComponentId) { myLogger.fine("Trying to add collector " + inCollectorComponentId); CollectorListStatus outStatus = CollectorListStatus.FULL; if (canHandle()) { outStatus = this.myCollectorList.add(inCollectorComponentId); if (outStatus == CollectorListStatus.ADDED) { myLogger.fine("Collector " + inCollectorComponentId + " added."); } else { if (myLogger.isLoggable(Level.FINE)) { myLogger.fine("Collector " + inCollectorComponentId + " already contained in collector list, not added again."); } } } else { myLogger.fine("The blobber has reached its capacity, collector " + inCollectorComponentId + " not added."); } return outStatus; } protected boolean canHandle() { return this.myCollectorList.size() < getMaxCollectorCount(); } /** * Removes the stated collector from the blobber. If the worker is currently * working with the specified collector it will finish the operation. This * method will however NOT wait for the operation, it returns immediately. * * @param inCollectorComponentId a String stating the name of the collector to remove. * @return a CollectorListStatus stating the result. Values to expect are * {@link CollectorListStatus#UNKNOWN} or * {@link CollectorListStatus#REMOVED}. */ public CollectorListStatus removeCollector(String inCollectorComponentId) { myLogger.fine("Trying to remove collector " + inCollectorComponentId); CollectorListStatus outStatus = this.myCollectorList.remove(inCollectorComponentId); if (outStatus == CollectorListStatus.REMOVED) { myLogger.fine("Collector removed."); } else { myLogger.fine("Collector not found in collector list, hence not removed."); } return outStatus; } /** * Checks if the collector list contains the stated collector. * * @param inCollectorComponentId a String stating the name of the collector to look for. * @return a CollectorListStatus stating the result. Values to expect are * {@link CollectorListStatus#KNOWN} or * {@link CollectorListStatus#UNKNOWN}. */ public CollectorListStatus containsCollector(String inCollectorComponentId) { return myCollectorList.contains(inCollectorComponentId); } /** * Gets the reference of the {@link MonitorCollector} component that is identified by its name * <code>inCollectorName</code> which could for example be "CONTROL/CM01/MONITOR_COLLECTOR". * <p> * Collector references are cached in {@link #collectorName2ComponentReference} across blobber cycles. */ protected MonitorCollectorOperations getMonitorCollector(String inCollectorName) throws AcsJContainerServicesEx { MonitorCollectorOperations ret = collectorName2ComponentReference.get(inCollectorName); if (ret == null) { if (myLogger.isLoggable(Level.FINE)) { myLogger.fine("Trying to fetch collector for container " + inCollectorName); } ret = MonitorCollectorHelper.narrow(myContainerServices.getComponent(inCollectorName)); collectorName2ComponentReference.put(inCollectorName, ret); } return ret; } /** * This method will be called at fixed intervals. It gathers the data from all registered collectors and stores it * in the database, using the layer from module TMCBD/DAO. * * @see java.lang.Runnable#run() */ @Override public void run() { cycleCount.incrementAndGet(); myLogger.fine("Running BlobberWorker cycle " + cycleCount); int collectorCount = 0; int insertCount = 0; StopWatch stopWatchAllCollectors = new StopWatch(myLogger); // Checking memory requires a GC run (and no other components running in the same container) // to give reasonable results. Running GC from inside the program we don't want to do as default, // thus the use of the cheating property. long usedMemKBBeforeCycle = -1; if (Boolean.getBoolean(BLOBBER_CHECK_JVM_MEMORY_PROPERTYNAME)) { System.gc(); Runtime rt = Runtime.getRuntime(); usedMemKBBeforeCycle = (rt.totalMemory() - rt.freeMemory()) / 1024; myLogger.fine("Used JVM memory in kB after GC before blobber cycle: " + usedMemKBBeforeCycle); } // loop over all collectors. Note that collectors can be added and removed during this loop, // so that the initial list size does not necessarily show the number of executed collectors. CollectorData collectorData = null; myCollectorList.resetIterator(); while (myCollectorList.hasNext()) { try { collectorData = myCollectorList.next(); // has this thread been requested to terminate? if (shouldTerminate) { myLogger.info("Loop over collectors terminated prematurely, skipping '" + collectorData.getCollectorId() + "' and subsequent collectors."); break; } collectorCount++; StopWatch stopWatchCurrentCollector = new StopWatch(myLogger); // Get the corba ref for the current collector MonitorCollectorOperations collector = getMonitorCollector(collectorData.getCollectorId()); // The data retrieval, processing, and storage happens here insertCount += harvestCollector(collectorData, collector); stopWatchCurrentCollector.logLapTime("process monitoring data from collector " + collectorData.getCollectorId()); } catch (Exception e) { myLogger.log(Level.WARNING, "Exception caught while processing monitor collector " + collectorData.getCollectorId() + "; the data cache for this collector will be cleared, the data is LOST FOREVER", e); // @TODO Shouldn't we raise an alarm also here, now that we do when blobber comp fails to initialize? // Then it would need to be cleared once (the same collector's??) data is processed ok in the next round. } } if (Boolean.getBoolean(BLOBBER_CHECK_JVM_MEMORY_PROPERTYNAME)) { System.gc(); Runtime rt = Runtime.getRuntime(); long usedMemKBAfterCycle = (rt.totalMemory() - rt.freeMemory()) / 1024; myLogger.fine("Used JVM memory in kB after GC after blobber cycle: " + usedMemKBAfterCycle); } long totalTimeMillis = stopWatchAllCollectors.getLapTimeMillis(); if (totalTimeMillis < collectIntervalSec * 1000) { // the good case: all collectors were processed within the foreseen time window String msg = "Processed monitoring data from " + collectorCount + " collector(s) in " + totalTimeMillis + " ms (within the time limit)."; myLogger.fine(msg); if (isProfilingEnabled) { debugDataSender.sendUDPPacket(msg, cycleCount.longValue()); debugDataSender.sendUDPPacket("Total inserts for cycle " + cycleCount + " were " + insertCount, cycleCount.longValue()); } } else { // the bad case: this run took too long. String msg = "Processed monitoring data from " + collectorCount + " collector(s) in " + totalTimeMillis + " ms (exceeding the time limit of " + collectIntervalSec + " s)."; myLogger.warning(msg); if (isProfilingEnabled) { debugDataSender.sendUDPPacket(msg, cycleCount.longValue()); debugDataSender.sendUDPPacket("Total inserts for cycle " + cycleCount + " were " + insertCount, cycleCount.longValue()); } } } /** * Retrieves and processes the data from one monitor collector component, that is, from one container. * <p> * Storage details: * <ul> * <li>Uses {@link MonitorDAO} to control DB transactions and store the data. * <li>Uses a total of one DB transaction (for the data from all properties of all devices, which comes in many * {@link MonitorDataBlock}s). * </ul> * * @param collectorData * The collector ID / name, used for logging and stats. * @param collector * Reference to the collector component * @return Number of attempted or successful DB inserts, i.e., calls to * {@link alma.archive.tmcdb.DAO.MonitorDAO#store(alma.archive.tmcdb.DAO.ComponentData)}. * @throws Exception */ private int harvestCollector(CollectorData collectorData, MonitorCollectorOperations collector) throws Exception { int insertCount = 0; myLogger.fine("About to call " + collectorData.getCollectorId() + "#getMonitorData()"); MonitorDataBlock[] dataBlocks = collector.getMonitorData(); collectorData.setLastSuccessfulAccessTime(System.currentTimeMillis()); if (dataBlocks != null) { // HSO TODO: do not call name() on the remote component, but use the local getCollectorId which happens to // be the name. if (myLogger.isLoggable(Level.FINE)) { myLogger.fine("Received " + dataBlocks.length + " MonitorDataBlocks from collector " + collector.name()); } if (isProfilingEnabled) { debugDataSender.sendUDPPacket("Received " + dataBlocks.length + " MonitorDataBlocks from collector " + collector.name(), cycleCount.longValue()); Runtime runtime = Runtime.getRuntime(); debugDataSender.sendUDPPacket("Used memory: " + Long.toString((runtime.totalMemory() - runtime.freeMemory())), cycleCount.longValue()); debugDataSender.sendUDPPacket("Free memory: " + Long.toString(runtime.freeMemory()), cycleCount.longValue()); debugDataSender.sendUDPPacket("Total memory: " + Long.toString(runtime.totalMemory()), cycleCount.longValue()); } for (MonitorDAO monitorDAO : myMonitorDAOList) { monitorDAO.openTransactionStore(Long.toString(cycleCount.longValue()) + "-" + collectorData.getCollectorId()); // @TODO: Should we catch / log the possible Exception, or just let it fly? } // iterate over devices for (MonitorDataBlock block : dataBlocks) { myLogger.log(AcsLogLevel.DEBUG, "MonitorDataBlock for device " + block.componentName + " contains " + block.monitorBlobs.length + " MonitorBlobs"); // iterate over properties for (MonitorBlob blob : block.monitorBlobs) { BlobData blobData = null; try { String propertyName = blob.propertyName; String propertyNameSimple = propertyName.substring(propertyName.indexOf(':') + 1); // TODO: Shouldn't we pass propertyName instead of propertyNameSimple? List<MonitorPointTimeSeries> containerList = anyExtractor.extractData(blob.blobDataSeq, propertyNameSimple); // iterate over expanded logical properties for (int index = 0; index < containerList.size(); index++) { MonitorPointTimeSeries container = containerList.get(index); // The serial number by default comes from the device, // but it can be overwritten for a monitor point. // In Alma this is done for correlator properties. String serialNumber = block.deviceSerialNumber; if (blob.propertySerialNumber != null) { if (blob.propertySerialNumber.length == 1) { // One SN per baci property, even if it expands to multiple monitor points. serialNumber = blob.propertySerialNumber[0]; } else if (blob.propertySerialNumber.length > index) { serialNumber = blob.propertySerialNumber[index]; } else if (blob.propertySerialNumber.length > 0) { this.myLogger.warning("Underspecified MonitorBlob#propertySerialNumber for " + blob.propertyName); } } myLogger.log(AcsLogLevel.DEBUG, "Handling data for property " + blob.propertyName); blobData = createBlobData(block, blob, container, propertyNameSimple, serialNumber, myLogger); if (blobData.getDataSize() > 0) { insertCount++; // hand over our blob data to the DAO(s) storeData(blobData); } } } catch (Exception e) { if (!this.loggedFailedStore.contains(blob.propertyName)) { this.loggedFailedStore.add(blob.propertyName); // @TODO check if http://jira.alma.cl/browse/COMP-4512 can be closed String msg = "Problem when handling property [" + blob.propertyName + "]. " + "The data cache for this property in collector [" + collectorData.getCollectorId() + "] will be cleared, the data is NOT stored. "; myLogger.log(Level.WARNING, msg, e); // unclear if we want this log. If so, it should be included in the above log, // to avoid spreading the info over many log records that won't be adjacent in jlog. // see alma.archive.tmcdb.DAO.ComponentData.toString() where currently the clob data is excluded. myLogger.log(Level.WARNING, "The data contained in the data cache: " + blobData); } else { // RK: Matthias suggested we log this too // HSO commented it out again, because now instead we log this AcsJStoreFailureEx // directly in alma.archive.tmcdb.DAO.MonitorDAOImpl.store(ComponentData) at level FINER // myLogger.log(Level.WARNING, // "Repeat of problem when handling property [" // + blob.propertyName + "]", e); } } } // end loop over properties of a single device } // end loop over devices of a single container // close the transaction for (MonitorDAO monitorDAO : myMonitorDAOList) { try { myLogger.fine("myMonitorDAO.closeTransactionStore() for: " + dataBlocks.length + " MonitorDataBlocks from collector " + collector.name()); monitorDAO.closeTransactionStore(); } catch (Exception ex) { myLogger.log(Level.WARNING, "Exception caught. Some monitoring data couldn't be archived", ex); } } } return insertCount; } /** * Creates a BlobData object and fills it with data from the given block, blob etc. * Broken out from {@link #harvestCollector(CollectorData, MonitorCollectorOperations)} * to allow fine-grained unit testing. */ static BlobData createBlobData(MonitorDataBlock block, MonitorBlob blob, MonitorPointTimeSeries mpTs, String propertyNameSimple, String serialNumber, Logger logger) { BlobData blobData = new BlobData(mpTs, logger); blobData.componentName = block.componentName; blobData.serialNumber = serialNumber; blobData.propertyName = propertyNameSimple; blobData.startTime = block.startTime; blobData.index = mpTs.getMonitorPointIndex(); blobData.startTime = block.startTime; blobData.stopTime = block.stopTime; blobData.calculateStatistics(); return blobData; } /** * Stores the data using the MonitorPointExpertImpl layer. */ protected void storeData(BlobData inBlobData) throws Exception { if (myLogger.isLoggable(Level.FINEST)) { // we only log at level finest because the same log will also come from inside the DAO#store implementation // @HSO TODO: Include the "clob" field again in toString, but use a different method here // which will not log clob data. myLogger.finest("Storing property data for " + inBlobData.componentName + ": " + inBlobData.toString()); } Exception firstExInDAOStore = null; for (MonitorDAO monitorDAO : myMonitorDAOList) { try { monitorDAO.store(inBlobData); } catch (Exception ex) { if (firstExInDAOStore == null) { // throw this later, but first store in the other DAOs firstExInDAOStore = ex; } } } if (firstExInDAOStore != null) { throw firstExInDAOStore; } } /** * An interesting, non-standard way of supplying a listening GUI application with debug logs, * without having to tap the log stream. */ private static class DebugDataSender { public static final String targetHost = "tmc-services.aiv.alma.cl"; private InetAddress address; private String location; private String blobberName; public DebugDataSender(String blobberName) throws UnknownHostException { this.blobberName = blobberName; location = System.getenv("LOCATION"); if (location == null) { location = "GENERIC"; } address = InetAddress.getByName(targetHost); } protected void sendUDPPacket(String msg, long cycleCount) { try { int port = 9090; String date = IsoDateFormat.formatCurrentDate(); String compMsg = location + " | " + blobberName + " | " + cycleCount + " | " + date + "|" + msg; byte[] message = compMsg.getBytes(); // Initialize a datagram packet with data and address DatagramPacket packet = new DatagramPacket(message, message.length, address, port); // Create a datagram socket, send the packet through it, close it. // @TODO (HSO): can/should we reuse this DatagramSocket across calls? DatagramSocket dsocket = new DatagramSocket(); dsocket.send(packet); dsocket.close(); } catch (Exception e) { // do not use the logger here, since the whole purpose of using DebugDataSender is to produce debug data // without jamming the log channels. System.err.println("Exception caught while sending UDP packet:" + e.getMessage()); } } public String toString() { return "location=" + location + " address=" + address + " blobberName=" + blobberName; } } }