/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.addthis.hydra.job.alert.types;

import javax.annotation.Nullable;

import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;

import com.addthis.codec.annotations.Time;
import com.addthis.hydra.job.Job;
import com.addthis.hydra.job.alert.AbstractJobAlert;
import com.addthis.hydra.job.alert.AutoGenerated;
import com.addthis.hydra.job.alert.JobAlertUtil;
import com.addthis.hydra.job.alert.SuppressChanges;
import com.addthis.meshy.MeshyClient;

import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import com.fasterxml.jackson.annotation.JsonProperty;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * This {@link AbstractJobAlert JobAlert} <span class="hydra-summary">alerts on number of files generated</span>.
 * <p>
 * If a value for a single host is above or below the threshold from the
 * mean value then raise an alert. The threshold is calculated as the
 * maximum of {@code tolerance} and ({@code sigma} multiplied by the
 * standard deviation).
 *
 * @user-reference
 */
@JsonIgnoreProperties(ignoreUnknown = true)
public class FileCountJobAlert extends AbstractJobAlert {

    private static final Logger log = LoggerFactory.getLogger(FileCountJobAlert.class);

    /**
     * Number of standard deviations away from the mean for an alert
     * to trigger. Default is 1.0.
     */
    @JsonProperty public final double sigma;

    /**
     * Absolute delta in number of files that are tolerated
     * before an alert triggers. Default is 0.0.
     */
    @JsonProperty public final int tolerance;

    /**
     * Path to the files that should be monitored.
     * Use glob expansion (wildcards) to match against multiple files.
     */
    @JsonProperty public final String canaryPath;

    /**
     * Creates the alert from its serialized properties.
     * <p>
     * Every argument up to and including {@code activeTriggerTimes} is handed to the
     * {@link AbstractJobAlert} constructor (which receives {@code activeTriggerTimes}
     * ahead of {@code activeJobs}); the remaining three are stored on this instance.
     *
     * @param sigma      multiplier applied to the standard deviation when deriving the threshold
     * @param tolerance  absolute file-count delta that is always tolerated
     * @param canaryPath path (may contain wildcards) of the files to count
     */
    public FileCountJobAlert(@Nullable @JsonProperty("alertId") String alertId,
                             @JsonProperty("description") String description,
                             @Time(TimeUnit.MINUTES) @JsonProperty("delay") long delay,
                             @JsonProperty("email") String email,
                             @JsonProperty("webhookURL") String webhookURL,
                             @JsonProperty(value = "jobIds", required = true) List<String> jobIds,
                             @JsonProperty("suppressChanges") SuppressChanges suppressChanges,
                             @JsonProperty("autoGenerated") AutoGenerated autoGenerated,
                             @JsonProperty("lastAlertTime") long lastAlertTime,
                             @JsonProperty("activeJobs") Map<String, String> activeJobs,
                             @JsonProperty("activeTriggerTimes") Map<String, Long> activeTriggerTimes,
                             @JsonProperty("sigma") double sigma,
                             @JsonProperty("tolerance") int tolerance,
                             @JsonProperty("canaryPath") String canaryPath) {
        super(alertId, description, delay, email, webhookURL, jobIds, suppressChanges,
              autoGenerated, lastAlertTime, activeTriggerTimes, activeJobs);
        this.sigma = sigma;
        this.tolerance = tolerance;
        this.canaryPath = canaryPath;
    }

    /**
     * Returns the fixed human-readable label for this alert type.
     */
    @JsonIgnore
    @Override
    public String getTypeString() {
        return "Discrepancy in counts of log files across tasks";
    }

    /**
     * Template for one out-of-range entry. Arguments, in order: map key, file count,
     * comparison symbol, violated bound, mean, "minus"/"plus", sigma, standard
     * deviation, tolerance.
     */
    private static final String ERROR_MESSAGE =
            "Host %s has %d log files which is %s than threshold %f"
            + " derived as mean value %f %s Math.max(%f multiplied by the"
            + " standard deviation %f, %d).\n";

    /**
     * Compares the per-entry file counts reported for {@code canaryPath} against their mean.
     * <p>
     * The threshold is {@code Math.max(sigma * stddev, tolerance)}, where {@code stddev} is
     * the population standard deviation (sum of squared deviations divided by the number of
     * entries). Any entry whose count lies strictly outside {@code mean ± threshold}
     * contributes one line to the result.
     *
     * @param meshClient           client passed through to {@link JobAlertUtil#getFileCountPerTask}
     * @param job                  job whose id is used for the file-count lookup
     * @param previousErrorMessage not used by this alert type
     * @return the concatenated messages for all out-of-range entries, or {@code null} when
     *         fewer than two counts are available or no entry is out of range
     */
    @Nullable
    @Override
    protected String testAlertActiveForJob(@Nullable MeshyClient meshClient, Job job, String previousErrorMessage) {
        Map<String, Integer> logCounts = JobAlertUtil.getFileCountPerTask(meshClient, job.getId(), canaryPath);
        log.debug("Log count map is {}", logCounts);
        // A spread cannot be judged from fewer than two samples.
        if (logCounts.size() < 2) {
            return null;
        }

        // Single-pass (Welford-style) accumulation of the mean and the sum of
        // squared deviations from it.
        double mean = 0.0;
        double m2 = 0.0;
        int index = 0;
        for (Integer logCount : logCounts.values()) {
            index++;
            double delta = logCount - mean;
            mean += delta / index;
            m2 += delta * (logCount - mean);
        }
        double stddev = Math.sqrt(m2 / logCounts.size());
        log.debug("Mean is {} stddev is {}", mean, stddev);

        // The threshold and both bounds are the same for every entry, so compute them once.
        double threshold = Math.max(sigma * stddev, tolerance);
        double lowerBound = mean - threshold;
        double upperBound = mean + threshold;

        StringBuilder errors = new StringBuilder();
        for (Map.Entry<String, Integer> entry : logCounts.entrySet()) {
            String hostUUID = entry.getKey();
            int logCount = entry.getValue();
            // The two conditions below should never both be true, but they are
            // tested independently so that such an illegal state is still reported.
            if (logCount < lowerBound) {
                errors.append(String.format(ERROR_MESSAGE, hostUUID, logCount, "<", lowerBound,
                                            mean, "minus", sigma, stddev, tolerance));
            }
            if (logCount > upperBound) {
                errors.append(String.format(ERROR_MESSAGE, hostUUID, logCount, ">", upperBound,
                                            mean, "plus", sigma, stddev, tolerance));
            }
        }

        String errorString = errors.toString();
        return errorString.isEmpty() ? null : errorString;
    }

    /**
     * Validates the alert parameters.
     *
     * @return a description of the first invalid parameter found, or {@code null} when
     *         {@code sigma} is a positive number and {@code tolerance} is non-negative
     */
    @Nullable
    @Override
    public String isValid() {
        // NaN must be rejected explicitly: "NaN <= 0.0" is false, and a NaN sigma would
        // make the threshold NaN so that neither bound comparison could ever fire.
        if (Double.isNaN(sigma) || (sigma <= 0.0)) {
            return "sigma parameter must be a positive value";
        }
        if (tolerance < 0) {
            return "tolerance parameter must be a non-negative value";
        }
        return null;
    }
}