/* * NOTE: This copyright does *not* cover user programs that use HQ * program services by normal system calls through the application * program interfaces provided as part of the Hyperic Plug-in Development * Kit or the Hyperic Client Development Kit - this is merely considered * normal use of the program, and does *not* fall under the heading of * "derived work". * * Copyright (C) [2004, 2005, 2006], Hyperic, Inc. * This file is part of HQ. * * HQ is free software; you can redistribute it and/or modify * it under the terms version 2 of the GNU General Public License as * published by the Free Software Foundation. This program is distributed * in the hope that it will be useful, but WITHOUT ANY WARRANTY; without * even the implied warranty of MERCHANTABILITY or FITNESS FOR A * PARTICULAR PURPOSE. See the GNU General Public License for more * details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 * USA. */ package org.hyperic.hq.measurement.agent.server; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.IOException; import java.util.ArrayList; import java.util.Calendar; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedList; import java.util.Properties; import java.util.Set; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; import javax.net.ssl.SSLPeerUnverifiedException; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.hyperic.hq.agent.server.AgentStartException; import org.hyperic.hq.agent.server.AgentStorageException; import org.hyperic.hq.agent.server.AgentStorageProvider; import org.hyperic.hq.agent.server.monitor.AgentMonitorException; import org.hyperic.hq.agent.server.monitor.AgentMonitorSimple; import org.hyperic.hq.bizapp.agent.CommandsAPIInfo; import org.hyperic.hq.bizapp.client.AgentCallbackClientException; import org.hyperic.hq.bizapp.client.MeasurementCallbackClient; import org.hyperic.hq.bizapp.client.StorageProviderFetcher; import org.hyperic.hq.common.SystemException; import org.hyperic.hq.measurement.data.DSNList; import org.hyperic.hq.measurement.data.MeasurementReport; import org.hyperic.hq.measurement.data.MeasurementReportConstructor; import org.hyperic.hq.measurement.server.session.SRN; import org.hyperic.hq.product.MetricValue; import org.hyperic.hq.util.Reference; import org.hyperic.util.StringUtil; import org.hyperic.util.encoding.Base64; /** * Deals with sending measurements back to the server (including * persisting them on disk.) */ public class SenderThread extends AgentMonitorSimple implements Sender, Runnable { private static final String PROP_METRICDUP = "agent.metricDup"; private static final String PROP_MAXBATCHSIZE = "agent.maxBatchSize"; private static final String PROP_METRICDEBUG = "agent.metricDebug"; // The threshold for logging server offset issues in the agent log. // XXX: May need to revisit this, potentially alot of output for // agents that are more than 3 minute ahead or behind the server. // (this may be a good thing) private static final long MAX_SERVERDIFF = 3 * 60 * 1000; private static final int PROP_RECSIZE = 34; // 34 byte records. private static final int SEND_INTERVAL = 60000; private static final int SLEEP_INTERVAL = 300000; private static final int MAX_BATCHSIZE = 500; private static final String MEASURENENT_LISTNAME = "measurement_spool"; private static final String AVAILABILITY_LISTNAME = "availability_spool"; private static AtomicBoolean stopSendingData = new AtomicBoolean(); private static AtomicInteger stopSendingDataCounter = new AtomicInteger(); private volatile boolean shouldDie; private final Log log; private final MeasurementCallbackClient client; private final AgentStorageProvider storage; private String agentToken; private final LinkedList<Record> transitionQueue; // This toggle will avoid displaying non-stop messages about server down private int metricDup = 0; private int maxBatchSize = MAX_BATCHSIZE; private final Set metricDebug; private final MeasurementSchedule schedule; // Current difference time between the server and agent in ns. // Update on each call to sendMeasurementReport(). private long serverDiff = 0; // Statistics private long stat_numBatchesSent = 0; private long stat_totBatchSendTime = 0; private long stat_totMetricsSent = 0; private Set<Record> lastRecords = new HashSet<Record>(); SenderThread(Properties bootProps, AgentStorageProvider storage, MeasurementSchedule schedule) throws AgentStartException { String sMetricDup, sMaxBatchSize, sMetricDebug; this.log = LogFactory.getLog(SenderThread.class); this.shouldDie = false; this.storage = storage; this.client = setupClient(); this.transitionQueue = new LinkedList<Record>(); this.metricDebug = new HashSet(); this.schedule = schedule; String measuementInfo = bootProps.getProperty(MEASURENENT_LISTNAME); if (measuementInfo != null) { storage.addOverloadedInfo(MEASURENENT_LISTNAME, measuementInfo); } String AvailabilityInfo = bootProps.getProperty(AVAILABILITY_LISTNAME); if (AvailabilityInfo != null) { storage.addOverloadedInfo(AVAILABILITY_LISTNAME, AvailabilityInfo); } // Setup our storage list try { // Create list early since we want a smaller recordsize // than the default of 1k. this.storage.createList(MEASURENENT_LISTNAME, PROP_RECSIZE); this.storage.createList(AVAILABILITY_LISTNAME, PROP_RECSIZE); } catch (AgentStorageException ignore) { // Most likely an agent update where the existing rt schedule // already exists. Will fall back to the old 1k size. } sMetricDup = bootProps.getProperty(PROP_METRICDUP); if(sMetricDup != null){ try { this.metricDup = Integer.parseInt(sMetricDup); } catch(NumberFormatException exc){ throw new AgentStartException(PROP_METRICDUP + " is not " + "a valid integer ('" + sMetricDup + "')"); } this.log.info("Duplicating metrics " + this.metricDup + " times"); } sMaxBatchSize = bootProps.getProperty(PROP_MAXBATCHSIZE); if(sMaxBatchSize != null){ try { this.maxBatchSize = Integer.parseInt(sMaxBatchSize); } catch(NumberFormatException exc){ throw new AgentStartException(PROP_MAXBATCHSIZE + " is not " + "a valid integer ('" + sMaxBatchSize + "')"); } } sMetricDebug = bootProps.getProperty(PROP_METRICDEBUG); if(sMetricDebug != null){ try { for (Object element : StringUtil.explode(sMetricDebug, " ")) { Integer metId = new Integer((String)element); this.metricDebug.add(metId); this.log.info("metricDebug: Enabling special debugging " + "for metric id=" + metId); } } catch(NumberFormatException exc){ throw new AgentStartException(PROP_METRICDEBUG + " must " + "contain integers seperated by "+ "spaces"); } } this.log.info("Maximum metric batch size set to " + this.maxBatchSize); } private MeasurementCallbackClient setupClient() throws AgentStartException { StorageProviderFetcher fetcher; fetcher = new StorageProviderFetcher(this.storage); return new MeasurementCallbackClient(fetcher); } void die(){ this.shouldDie = true; } // Use a small class which holds a bunch of the data we need, just so // we can pass it back in 1 method call private static class Record { final int dsnId, derivedID; final MetricValue data; private Integer hashCode = null; //This field is not serialized private final boolean isAvail; private Record(int dsnId, MetricValue data, int derivedID, boolean isAvail){ this.dsnId = dsnId; this.data = data; this.derivedID = derivedID; this.isAvail = isAvail; } public Record(int dsnID, MetricValue measVal, int derivedID) { this (dsnID, measVal, derivedID, false); } @Override public int hashCode() { if (hashCode != null) { return hashCode.intValue(); } final Long timestamp = new Long(data.getTimestamp()*71); final Integer mId = new Integer(derivedID*71); hashCode = new Integer(7 + (timestamp.hashCode()*71) + (mId.hashCode()*71)); return hashCode.intValue(); } @Override public boolean equals(Object rhs) { if (this == rhs) { return true; } if (rhs instanceof Record) { Record r = (Record)rhs; return ((r.derivedID == derivedID) && (r.data.getTimestamp() == data.getTimestamp())); } return false; } @Override public String toString() { return "mId="+derivedID+",timestamp="+data.getTimestamp()+ ",value="+data.getValue()+",isAvail="+isAvail; } } private static Record decodeRecord(String val) throws IOException { ByteArrayInputStream bIs; DataInputStream dIs; MetricValue measVal; boolean isAvail; int derivedID, dsnID; long retTime; bIs = new ByteArrayInputStream(Base64.decode(val)); dIs = new DataInputStream(bIs); derivedID = dIs.readInt(); retTime = dIs.readLong(); dsnID = dIs.readInt(); measVal = new MetricValue(dIs.readDouble(), retTime); return new Record(dsnID, measVal, derivedID); } private static String encodeRecord(Record record) throws IOException { ByteArrayOutputStream bOs; DataOutputStream dOs; bOs = new ByteArrayOutputStream(); dOs = new DataOutputStream(bOs); dOs.writeInt(record.derivedID); dOs.writeLong(record.data.getTimestamp()); dOs.writeInt(record.dsnId); dOs.writeDouble(record.data.getValue()); return Base64.encode(bOs.toByteArray()); } public void processData(int dsnId, MetricValue data, int derivedID, boolean isAvail){ double val; val = data.getValue(); if(this.metricDebug.contains(new Integer(derivedID))){ this.log.info("metricDebug: Placing DSN='" + dsnId + "' derivedID=" + derivedID + " value=" + val + " time=" + data.getTimestamp() + " into transition queue"); } synchronized(this.transitionQueue){ MetricValue measVal = new MetricValue(val, data.getTimestamp()); this.transitionQueue.add(new Record(dsnId, measVal, derivedID, isAvail)); } if(this.metricDup != 0){ ArrayList<Record> dupeList; long dTime; dupeList = new ArrayList<Record>(); dTime = data.getTimestamp(); for(int i=0; i<this.metricDup; i++){ MetricValue measVal = new MetricValue(val, dTime + i + 1); dupeList.add(new Record(dsnId, measVal, derivedID, isAvail)); } synchronized(this.transitionQueue){ this.transitionQueue.addAll(dupeList); } } } /** * This routine moves all the data from the transition queue into * the storage provider, so it can be shipped to the server. */ private void processTransitionQueue(){ String encodedRec; synchronized(this.transitionQueue){ for (Record rec : this.transitionQueue) { try { encodedRec = encodeRecord(rec); if (rec.isAvail) { this.storage.addToList(AVAILABILITY_LISTNAME, encodedRec); } else { this.storage.addToList(MEASURENENT_LISTNAME, encodedRec); } } catch(Exception exc){ this.log.error("Unable to store data: " + exc, exc); } } this.transitionQueue.clear(); try { this.storage.flush(); } catch(Exception exc){ this.log.error("Unable to flush storage", exc); } } } /** * Send a batch of measurement points back to the server. This * method sends at most maxBatchSize elements back. * * @return null if all the measurements in the list have been * sent, otherwise it returns the timestamp of the last * measurement sent to the server. */ private Long sendBatch(String listName, Reference<Integer> numSent) { MeasurementReportConstructor constructor; DSNList[] clientIds; long batchStart = 0, batchEnd = 0, lastMetricTime = 0, serverTime = 0; boolean success; int numUsed, numDebuggedSent; // Before sending the data off, make sure our transition queue is // empty this.processTransitionQueue(); numUsed = 0; numDebuggedSent = 0; constructor = new MeasurementReportConstructor(); final boolean debug = log.isDebugEnabled(); log.debug("Sending batch to server:"); Set<Record> records = new HashSet<Record>(); Set<Record> currentRecords = new HashSet<Record>(); boolean alreadyExist = false; // first we are going to ensure that all the data points that // we send over to the server are unique for (Iterator<String> it=storage.getListIterator(listName); (it!=null) && it.hasNext() && (numUsed < maxBatchSize); numUsed++) { try { Record r = SenderThread.decodeRecord(it.next()); currentRecords.add(r); //check of we added the record in our previous bulk if(lastRecords.contains(r)){ alreadyExist = true; }else{ alreadyExist = !records.add(r); } if (alreadyExist) { // nuke the dup if (debug) { log.debug("Dropping duplicate entry for " + r); } numUsed--; } } catch(IOException exc){ this.log.error("Error accessing record -- deleting: " + exc, exc); continue; } } int num = 0; long firstMetricTime = Long.MAX_VALUE; for (Iterator<Record> it=records.iterator(); it.hasNext(); num++) { Record rec = it.next(); lastMetricTime = rec.data.getTimestamp(); if(log.isDebugEnabled()){ this.log.debug(" Data: d=" + rec.derivedID + " r=" + rec.dsnId + " t=" + rec.data.getTimestamp() + " v=" + rec.data); } if(this.metricDebug.contains(new Integer(rec.derivedID))){ numDebuggedSent++; this.log.info("metricDebug: Pulled DSN=" + rec.dsnId + " derivedID=" + rec.derivedID + " value=" + rec.data.getValue() + " from tQueue -- sending"); } constructor.addDataPoint(rec.derivedID, rec.dsnId, rec.data); } numSent.set(num); // If we don't have anything to send -- move along if(numUsed == 0) { return null; } clientIds = constructor.constructDSNList(); success = false; try { MeasurementReport report; SRN[] srnList = this.schedule.getSRNsAsArray(); if (srnList.length == 0) { log.error("Agent does not have valid SRNs, but has metric data to send, removing measurements"); removeMeasurements(numUsed, listName); return null; } if(log.isDebugEnabled()){ for (SRN element : srnList) { this.log.debug(" SRN: " + element.getEntity() + "=" + element.getRevisionNumber()); } } report = new MeasurementReport(); if (this.agentToken == null) { this.agentToken = storage.getValue(CommandsAPIInfo.PROP_AGENT_TOKEN); } report.setAgentToken(this.agentToken); report.setClientIdList(clientIds); report.setSRNList(srnList); batchStart = now(); try { serverTime = this.client.measurementSendReport(report); } catch (IllegalArgumentException e) { throw new SystemException("error sending report: " + e + ", report=" + report, e); } batchEnd = now(); // Compute offset from server (will include network latency) this.serverDiff = Math.abs(serverTime - batchEnd); // Update the ServerTimeDiff object with the time offset between the // agent and the server and update the last sync time to now ServerTimeDiff.getInstance().setServerTimeDiff(serverTime - batchEnd); ServerTimeDiff.getInstance().setLastSync(batchEnd); if (this.serverDiff > MAX_SERVERDIFF) { // Complain if we are ahead or behind. This may be a bit // too excessive. if (serverTime < batchEnd) { this.log.error("Agent is " + (this.serverDiff / 1000) + " seconds ahead of the server. To " + "ensure accuracy of the charting and " + "alerting make sure the agent and server " + "clocks are synchronized"); } else { this.log.error("Agent is " + (this.serverDiff / 1000) + " seconds behind the server. To " + "ensure accuracy of the charting and " + "alerting make sure the agent and server " + "clocks are synchronized"); } } success = true; lastRecords = currentRecords; } catch(AgentCallbackClientException exc){ log.error("Error sending measurements: " + exc.getMessage(), exc); // return this so that the caller will attempt a retry on everything except Connection refused if (shouldRetry(exc)){ log.info("retrying measurement send"); return firstMetricTime; } } finally { if(numDebuggedSent != 0){ if(success){ this.log.info("metricDebug: Successfully sent " + numDebuggedSent + " debugged metrics to " + "server"); } else { this.log.info("metricDebug: Server reported failure " + "when sent " + numDebuggedSent + " debugged metrics"); } } } if(success){ removeMeasurements(numUsed, listName); this.stat_numBatchesSent++; this.stat_totBatchSendTime += (batchEnd - batchStart); this.stat_totMetricsSent += numUsed; if(numUsed == this.maxBatchSize){ return new Long(lastMetricTime); } else { return null; } } return null; } private long now() { return System.currentTimeMillis(); } /** * @return The number of measurements removed from the metric storage. * * @param num The maximum number of datapoints to remove. */ private int removeMeasurements(int num, String listName) { int j = 0; for (Iterator i = this.storage.getListIterator(listName); (i != null) && i.hasNext() && (j < num); j++) { i.next(); i.remove(); } try { this.storage.flush(); } catch (AgentStorageException exc) { this.log.error("Failed to flush agent storage", exc); } if(j != num){ this.log.error("Failed to remove " + (num - j) + "records"); } return j; } /** * MONITOR METHOD: Get the number of batches which have successfully * sent to the server */ public double getNumBatchesSent() throws AgentMonitorException { return this.stat_numBatchesSent; } /** * MONITOR METHOD: Get the total amount of time that the client has * spent sending batches */ public double getTotBatchSendTime() throws AgentMonitorException { return this.stat_totBatchSendTime; } /** * MONITOR METHOD: Get the total number of metrics which have been * transmitted to the server */ public double getTotMetricsSent() throws AgentMonitorException { return this.stat_totMetricsSent; } /** * MONITOR METHOD: Get the offset in ms between the agent and server */ public double getServerOffset() throws AgentMonitorException { return this.serverDiff; } public void run(){ Calendar controlCal = Calendar.getInstance(); controlCal.setTimeInMillis(System.currentTimeMillis()); Calendar cal = Calendar.getInstance(); controlCal.set(Calendar.SECOND, 5); while ((!this.shouldDie)&&(!stopSendingData.get())) { try { try { controlCal.add(Calendar.MINUTE, 1); long now = System.currentTimeMillis(); cal.setTimeInMillis(now + SEND_INTERVAL); // want to keep some randomness for all agents to send their // data. This way an agent is not pegged to a certain // second interval if (cal.get(Calendar.MINUTE) != controlCal.get(Calendar.MINUTE)) { long sleeptime = controlCal.getTimeInMillis() - now; if (sleeptime > 0) { Thread.sleep(controlCal.getTimeInMillis() - now); } } else { Thread.sleep(SEND_INTERVAL); } } catch(InterruptedException exc){ this.log.info("Measurement sender interrupted"); return; } if (log.isDebugEnabled()) { log.debug("Woke up, sending batch of metrics."); } //We are sending 2 different batches, the first is for availability and the second //is for measurement. We are doing this because we want to make sure that availability //data will get processed by the server even if the server is not able to process measurement //data at the moment (Jira issue [HHQ-5566]) Reference<Integer> numSent = new Reference<Integer>(0); sendData(AVAILABILITY_LISTNAME, numSent); sendData(MEASURENENT_LISTNAME, numSent); } catch (Throwable e) { log.error(e.getMessage(), e); } } } private void sendData(String listName, Reference<Integer> numSent) { Long lastMetricTime; lastMetricTime = this.sendBatch(listName, numSent); if(lastMetricTime != null){ String backlogNum = ""; final long start = System.currentTimeMillis(); // Give it a single shot to catch up before starting to squawk while((lastMetricTime = this.sendBatch(listName, numSent)) != null) { long now = System.currentTimeMillis(); long tDiff = now - lastMetricTime.longValue(); String backlog = Long.toString(tDiff / (60 * 1000)); if(((tDiff / (60 * 1000)) > 1) && (backlog.equals(backlogNum) == false)) { backlogNum = backlog; this.log.info(backlog + " minute(s) of metrics backlogged"); } if(this.shouldDie == true){ this.log.info("Dying with measurements backlogged"); return; } } final long total = System.currentTimeMillis() - start; if (total > SEND_INTERVAL) { log.info("Agent took " + (total/1000) + " seconds to send its " + listName + " metrics to the HQ Server."); } else if (log.isDebugEnabled()) { log.debug("Agent took " + total + " ms to send its " + listName); } } } // return this so that the caller will attempt a retry on everything except : // 1. Connection refused // 2. Permission Denied // 3. Service Unavailable // 4. SSLPeerUnverifiedException private boolean shouldRetry(AgentCallbackClientException exc) { //check if the server is down if(exc.getMessage().toLowerCase().endsWith("refused")){ log.error("Agent is not able to communicate with the server because Hyperic server is down" ); stopSendingDataCounter.set(0); return false; }else{ //check if the agent was deleted from the server if(exc.getMessage().endsWith(AgentCallbackClientException.PERMISSION_DENIED_ERROR_MSG)){ log.error("Agent is not able to communicate with the server because the agent was deleted from Hyperic server" + ", agent will stop sending data to Hyperic server. Please resetup the agent: stop the agent and delete data directory "); stopSendingData.set(true); return false; }else{ //check if there is a new certification in the server if((!(exc.getMessage().indexOf("Service Unavailable") != -1) && (exc.getExceptionOfType(SSLPeerUnverifiedException.class) == null)) ||(exc.getMessage().indexOf("peer not authenticated")!= -1)){ log.error("Agent is not able to communicate with the server because Hyperic server has a new certificate" + ", agent will stop sending data to Hyperic server. Please resetup the agent: stop the agent and delete data directory or " + "edit agent.properties file and change to: accept.unverified.certificates=true," +stopSendingDataCounter.get()); try { Thread.sleep(SEND_INTERVAL); } catch (InterruptedException e) { // TODO Auto-generated catch block e.printStackTrace(); } if(stopSendingDataCounter.incrementAndGet()>1000){ stopSendingData.set(true); } return false; }else{//Other error such as network, we should retry return true; } } } } }