package org.rhq.server.metrics.invalid; import static java.util.Arrays.asList; import java.util.ArrayList; import java.util.List; import java.util.concurrent.DelayQueue; import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.joda.time.DateTime; import org.rhq.server.metrics.ArithmeticMeanCalculator; import org.rhq.server.metrics.DateTimeService; import org.rhq.server.metrics.MetricsConfiguration; import org.rhq.server.metrics.MetricsDAO; import org.rhq.server.metrics.domain.AggregateNumericMetric; import org.rhq.server.metrics.domain.Bucket; import org.rhq.server.metrics.domain.RawNumericMetric; /** * <p> * This class tries to deal with invalid aggregate metrics. An invalid metric is one where * either min > avg or max < avg. There are a couple different bugs which made these * situations possible See https://bugzilla.redhat.com/show_bug.cgi?id=1110462 and * https://bugzilla.redhat.com/show_bug.cgi?id=1104885 for details. * </p> * * <p> * When an invalid metric is found, it is {@link #submit(AggregateNumericMetric) submitted} * to an internal queue for later processing. Metrics will be recomputed if possible; * otherwise, they will be deleted. * </p> * * @author John Sanda */ public class InvalidMetricsManager { private static final Log log = LogFactory.getLog(InvalidMetricsManager.class); private static final double THRESHOLD = 0.00001d; private DateTimeService dateTimeService; private MetricsDAO dao; private InvalidMetric current; private MetricsConfiguration configuration; private DelayQueue<InvalidMetric> queue; private ScheduledExecutorService executor; /** * The queue delay is specified in milliseconds and defaults to 10 minutes. This is the * amount of time an invalid metric has to wait in the queue before it can be removed * and processed. */ private long delay = Long.parseLong(System.getProperty("rhq.metrics.invalid.queue-delay", "600000")); private boolean isShutdown; public InvalidMetricsManager(DateTimeService dateTimeService, MetricsDAO dao) { this(dateTimeService, dao, Integer.parseInt(System.getProperty("rhq.metrics.invalid.poller.initial-delay", "300")), Integer.parseInt(System.getProperty("rhq.metrics.invalid.poller.period", "300"))); } InvalidMetricsManager(DateTimeService dateTimeService, MetricsDAO dao, int pollerDelay, int pollerPeriod) { this.dateTimeService = dateTimeService; this.dao = dao; configuration = new MetricsConfiguration(); queue = new DelayQueue<InvalidMetric>(); executor = Executors.newSingleThreadScheduledExecutor(); executor.scheduleAtFixedRate(new InvalidMetricRunnable(), pollerDelay, pollerPeriod, TimeUnit.SECONDS); } /** * This is a test hook. * * @param delay The queue delay */ void setDelay(long delay) { this.delay = delay; } /** * Shuts down the executor, waiting for any in progress work to finish. Any invalid * metrics that are in the queue will not be processed. */ public void shutdown() { log.info("Shutting down..."); isShutdown = true; queue.clear(); executor.shutdown(); } /** * Submits an invalid metric for later processing which is done in a separate thread. * The queue actually stores instances of {@link InvalidMetric}. Metrics belonging to * the same measurement schedule and from the same day will only have one invalid metric * stored in the queue. In other words, suppose a 1 hour metric from 14:00 is submitted. * Then a 6 hour metric from 12:00 for the same schedule id is submitted. This will result * in only one {@link InvalidMetric} stored stored in the queue. The reason being that * we need to look at all of the 1 hour, 6 hour, and 24 hour metrics; so, multiple * entries in the queue for the same day will only result in duplicate work. * * @param metric The invalid metric where invalid means either min > avg or max < avg * @return true if an invalid metric is added to the queue, false otherwise */ public boolean submit(AggregateNumericMetric metric) { DateTime day = dateTimeService.getTimeSlice(new DateTime(metric.getTimestamp()), configuration.getSixHourTimeSliceDuration()); InvalidMetric invalidMetric = new InvalidMetric(day, metric, delay); if (isShutdown) { log.info(invalidMetric + " will not be submitted since we are already shutdown."); return false; } if (queue.contains(invalidMetric) || (current != null && current.equals(invalidMetric))) { log.info(invalidMetric + " is already in the queue. It will not be resubmitted."); return false; } log.info("Adding " + invalidMetric + " to queue for processing"); queue.offer(invalidMetric); return true; } /** * This is a test hook * * @return The queue of invalid metrics */ public DelayQueue<InvalidMetric> getQueue() { return queue; } /** * A test hook that returns the count of invalid metrics including the current one being * worked on. */ int getRemainingInvalidMetrics() { return current == null ? queue.size() : queue.size() + 1; } /** * When this runs, it drains all invalid metrics (whose delays have expired) from the * queue and will recompute and remove each metric. */ private class InvalidMetricRunnable implements Runnable { @Override public void run() { List<InvalidMetric> invalidMetrics = new ArrayList<InvalidMetric>(queue.size()); queue.drainTo(invalidMetrics); for (InvalidMetric invalidMetric : invalidMetrics) { current = invalidMetric; try { handleInvalidMetric(current); } catch (Exception e) { log.warn("An unexpected occurred while processing invalid metric " + current, e); } } current = null; } } private void handleInvalidMetric(InvalidMetric invalidMetric) { log.info("Attempting to fix " + invalidMetric + ". This may include updates to 1 hour, 6 hour, and 24 hour metrics."); if (invalidMetric.metric.getBucket() == Bucket.TWENTY_FOUR_HOUR) { update24HourMetric(invalidMetric); } else if (invalidMetric.metric.getBucket() == Bucket.SIX_HOUR) { if (DateTime.now().isAfter(invalidMetric.day.plusDays(1))) { update24HourMetric(invalidMetric); } else { update6HourMetrics(asList(invalidMetric.metric)); } } else { // 1 hr metric DateTime sixHourTimeSlice = dateTimeService.getTimeSlice(new DateTime(invalidMetric.metric.getTimestamp()), configuration.getOneHourTimeSliceDuration()); if (DateTime.now().isAfter(invalidMetric.day.plusDays(1))) { update24HourMetric(invalidMetric); } else if (DateTime.now().isAfter(sixHourTimeSlice.plusHours(6))) { List<AggregateNumericMetric> sixHourMetrics = dao.findAggregateMetrics( invalidMetric.metric.getScheduleId(), Bucket.SIX_HOUR, invalidMetric.day.getMillis(), invalidMetric.day.plusDays(1).getMillis()); update6HourMetrics(sixHourMetrics); } else { update1HourMetrics(asList(invalidMetric.metric)); } } } /** * This method first looks for the 6 hour metrics from which the 24 hour metric was * computed. If no 6 hour metrics are found, the 24 hour metric is deleted. If we * find 6 hour metrics, we then try to update the corresponding 1 hour metrics * followed by the 6 hour metrics, and lastly recompute the 24 hour metric. */ private void update24HourMetric(InvalidMetric invalidMetric) { List<AggregateNumericMetric> sixHourMetrics = dao.findAggregateMetrics(invalidMetric.metric.getScheduleId(), Bucket.SIX_HOUR, invalidMetric.day.getMillis(), invalidMetric.day.plusDays(1).getMillis()); if (sixHourMetrics.isEmpty()) { // This likely means that the 6 hour data has already expired. The best // we can do at this point is to delete to the invalid metric. log.info("Deleting " + invalidMetric + " since the 6 hour metrics are no longer available."); remove24HourMetric(invalidMetric.metric); } else { List<AggregateNumericMetric> updated6HourMetrics = update6HourMetrics(sixHourMetrics); AggregateNumericMetric recomputed24HourMetric = computeAggregate(updated6HourMetrics, invalidMetric.metric.getScheduleId(), invalidMetric.day.getMillis(), Bucket.TWENTY_FOUR_HOUR); persist24HourMetric(recomputed24HourMetric); log.info(invalidMetric + " has been recomputed with a new value of " + getValueText(recomputed24HourMetric)); } } /** * This method first looks for and deletes any empty 6 hour metrics. It then looks * for any invalid metrics. For each invalid metric, it tries to recompute the * metric if the 1 hour data is available. If the 1 hour data is available it too * is updated as necessary, and then the 6 hour metric is recomputed and persisted. * If the 1 hour data is no longer available, the 6 hour metrics is deleted. * * @param sixHourMetrics The 6 hour metrics to update * @return */ private List<AggregateNumericMetric> update6HourMetrics(final List<AggregateNumericMetric> sixHourMetrics) { List<AggregateNumericMetric> updated6HourMetrics = removeEmpty6HourMetrics(sixHourMetrics); List<AggregateNumericMetric> invalid6HourMetrics = findInvalidMetrics(updated6HourMetrics); for (AggregateNumericMetric invalid6HourMetric : invalid6HourMetrics) { List<AggregateNumericMetric> oneHourMetrics = dao.findAggregateMetrics(invalid6HourMetric.getScheduleId(), Bucket.ONE_HOUR, invalid6HourMetric.getTimestamp(), new DateTime(invalid6HourMetric.getTimestamp()).plusHours(6).getMillis()); if (oneHourMetrics.isEmpty()) { // This likely means that the 1 hour data has already expired. The best // we can do at this point is to delete the invalid metric. log.info("Deleting 6 hour metric " + invalid6HourMetric + " since the 1 hour metrics are no longer available."); updated6HourMetrics = remove6HourMetric(invalid6HourMetric, sixHourMetrics); } else { // Since we have 1 hour metrics, we want to first inspect and update // them as best we can. Then we go ahead and recompute and persist the // new 6 hour metric. List<AggregateNumericMetric> updated1HourMetrics = update1HourMetrics(oneHourMetrics); AggregateNumericMetric recomputed6HourMetric = computeAggregate(updated1HourMetrics, invalid6HourMetric.getScheduleId(), invalid6HourMetric.getTimestamp(), Bucket.SIX_HOUR); updated6HourMetrics = replace6HourMetric(invalid6HourMetric, recomputed6HourMetric, sixHourMetrics); log.info("The invalid 6 hour metric " + invalid6HourMetric + " has been recomputed with a new value of " + getValueText(recomputed6HourMetric)); } } return updated6HourMetrics; } /** * This method first looks for and deletes any empty 1 hour metrics. It then looks * for any invalid metrics. For each invalid metric, we recompute and persist the * metric if the raw data is still available; otherwise, we delete the metric. * * @param oneHourMetrics The 1 hour metrics to update * @return The updated metrics which includes only those metric that have been * persisted and not deleted. */ private List<AggregateNumericMetric> update1HourMetrics(final List<AggregateNumericMetric> oneHourMetrics) { List<AggregateNumericMetric> updated1HourMetrics = removeEmpty1HourMetrics(oneHourMetrics); List<AggregateNumericMetric> invalid1HourMetrics = findInvalidMetrics(updated1HourMetrics); for (AggregateNumericMetric invalid1HourMetric : invalid1HourMetrics) { // Try to recompute the 1 hour metric. If the raw data is gone, then we // simply delete the invalid 1 hour metric; otherwise, we persist the // recomputed aggregate. AggregateNumericMetric recomputed1HourMetric = recompute1HourAggregateIfPossible(invalid1HourMetric); if (recomputed1HourMetric == null) { log.info("Deleting 1 hour metric " + invalid1HourMetric + " since the raw data is no longer available."); updated1HourMetrics = remove1HourMetric(invalid1HourMetric, updated1HourMetrics); } else { updated1HourMetrics = replace1HourMetric(invalid1HourMetric, recomputed1HourMetric, updated1HourMetrics); log.info("The invalid 1 hour metric " + invalid1HourMetric + " has been recomputed with a new value of " + getValueText(recomputed1HourMetric)); } } return updated1HourMetrics; } /** * Filters out empty aggregate metrics and deletes them from the database. * * @param metrics The metrics to search * @return A new collection containing non-empty aggregate metrics. The original collection is not modified. */ private List<AggregateNumericMetric> removeEmpty6HourMetrics(List<AggregateNumericMetric> metrics) { List<AggregateNumericMetric> nonEmptyMetrics = new ArrayList<AggregateNumericMetric>(); for (AggregateNumericMetric metric : metrics) { if (isEmptyMetric(metric)) { dao.deleteAggregate(metric); } else { nonEmptyMetrics.add(metric); } } return nonEmptyMetrics; } /** * Filters out empty aggregate metrics and deletes them from the database. * * @param metrics The metrics to search * @return A new collection containing non-empty aggregate metrics. The original collection is not modified. */ private List<AggregateNumericMetric> removeEmpty1HourMetrics(List<AggregateNumericMetric> metrics) { List<AggregateNumericMetric> nonEmptyMetrics = new ArrayList<AggregateNumericMetric>(); for (AggregateNumericMetric metric : metrics) { if (isEmptyMetric(metric)) { dao.deleteAggregate(metric); } else { nonEmptyMetrics.add(metric); } } return nonEmptyMetrics; } private boolean isEmptyMetric(AggregateNumericMetric metric) { return metric.getMin().equals(Double.NaN) && metric.getMax().equals(Double.NaN) && metric.getAvg().equals(0d); } private List<AggregateNumericMetric> findInvalidMetrics(List<AggregateNumericMetric> metrics) { List<AggregateNumericMetric> invalidMetrics = new ArrayList<AggregateNumericMetric>(); for (AggregateNumericMetric metric : metrics) { if (isInvalidMetric(metric)) { invalidMetrics.add(metric); } } return invalidMetrics; } public boolean isInvalidMetric(AggregateNumericMetric metric) { return (metric.getMax() < metric.getAvg() && Math.abs(metric.getMax() - metric.getAvg()) > THRESHOLD) || (metric.getMin() > metric.getAvg() && Math.abs(metric.getMin() - metric.getAvg()) > THRESHOLD) || (Double.isNaN(metric.getAvg()) || Double.isNaN(metric.getMin()) || Double.isNaN(metric.getMin())); } /** * This method recomputes the 1 hour aggregate if the raw data is still available. * @param metric The original 1 hour aggregate metric * @return The new 1 hour aggregate or <code>null</code> if the raw data is no longer available */ private AggregateNumericMetric recompute1HourAggregateIfPossible(AggregateNumericMetric metric) { List<RawNumericMetric> rawMetrics = dao.findRawMetrics(metric.getScheduleId(), metric.getTimestamp(), new DateTime(metric.getTimestamp()).plusHours(1).getMillis()); if (!rawMetrics.isEmpty()) { return computeAggregateFromRaw(rawMetrics, metric); } else { return null; } } private AggregateNumericMetric computeAggregateFromRaw(List<RawNumericMetric> rawMetrics, AggregateNumericMetric metric) { double min = Double.NaN; double max = min; int count = 0; ArithmeticMeanCalculator mean = new ArithmeticMeanCalculator(); double value; for (RawNumericMetric rawMetric : rawMetrics) { value = rawMetric.getValue(); if (count == 0) { min = value; max = min; } if (value < min) { min = value; } else if (value > max) { max = value; } mean.add(value); ++count; } // We let the caller handle setting the schedule id because in some cases we do // not care about it. return new AggregateNumericMetric(metric.getScheduleId(), metric.getBucket(), mean.getArithmeticMean(), min, max, metric.getTimestamp()); } private AggregateNumericMetric computeAggregate(List<AggregateNumericMetric> metrics, int scheduleId, long timestamp, Bucket bucket) { double min = Double.NaN; double max = min; int count = 0; ArithmeticMeanCalculator mean = new ArithmeticMeanCalculator(); for (AggregateNumericMetric metric : metrics) { if (count == 0) { min = metric.getMin(); max = metric.getMax(); } if (metric.getMin() < min) { min = metric.getMin(); } if (metric.getMax() > max) { max = metric.getMax(); } mean.add(metric.getAvg()); ++count; } // We let the caller handle setting the schedule id because in some cases we do // not care about it. return new AggregateNumericMetric(scheduleId, bucket, mean.getArithmeticMean(), min, max, timestamp); } private List<AggregateNumericMetric> remove1HourMetric(AggregateNumericMetric metric, List<AggregateNumericMetric> metrics) { return removeMetric(metric, metrics); } private List<AggregateNumericMetric> remove6HourMetric(AggregateNumericMetric metric, List<AggregateNumericMetric> metrics) { return removeMetric(metric, metrics); } private void remove24HourMetric(AggregateNumericMetric metric) { dao.deleteAggregate(metric); } private List<AggregateNumericMetric> replace1HourMetric(AggregateNumericMetric metric, AggregateNumericMetric newMetric, List<AggregateNumericMetric> metrics) { return replaceMetric(metric, newMetric, metrics, Bucket.ONE_HOUR); } private List<AggregateNumericMetric> replace6HourMetric(AggregateNumericMetric metric, AggregateNumericMetric newMetric, List<AggregateNumericMetric> metrics) { return replaceMetric(metric, newMetric, metrics, Bucket.SIX_HOUR); } private List<AggregateNumericMetric> removeMetric(AggregateNumericMetric metric, List<AggregateNumericMetric> metrics) { dao.deleteAggregate(metric); List<AggregateNumericMetric> updatedMetrics = new ArrayList<AggregateNumericMetric>(metrics); updatedMetrics.remove(metric); return updatedMetrics; } private List<AggregateNumericMetric> replaceMetric(AggregateNumericMetric metric, AggregateNumericMetric newMetric, List<AggregateNumericMetric> metrics, Bucket bucket) { switch (bucket) { case ONE_HOUR: persist1HourMetric(newMetric); break; case SIX_HOUR: persist6HourMetric(newMetric); break; default: throw new IllegalArgumentException(bucket + " cannot be used for this method"); } List<AggregateNumericMetric> updatedMetrics = new ArrayList<AggregateNumericMetric>(metrics); updatedMetrics.remove(metric); updatedMetrics.add(newMetric); return updatedMetrics; } private void persist1HourMetric(AggregateNumericMetric metric) { dao.insert1HourData(metric).get(); } private void persist6HourMetric(AggregateNumericMetric metric) { dao.insert6HourData(metric).get(); } private void persist24HourMetric(AggregateNumericMetric metric) { dao.insert24HourData(metric).get(); } private String getValueText(AggregateNumericMetric metric) { return "{max: " + metric.getMax() + ", min: " + metric.getMin() + ", avg: " + metric.getAvg() + "}"; } }