/* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.addthis.hydra.job.alert; import javax.annotation.Nonnull; import javax.annotation.Nullable; import java.net.ConnectException; import java.net.SocketTimeoutException; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import java.util.UUID; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; import com.addthis.basis.util.Parameter; import com.addthis.codec.annotations.Pluggable; import com.addthis.codec.annotations.Time; import com.addthis.codec.codables.Codable; import com.addthis.codec.json.CodecJSON; import com.addthis.hydra.job.Job; import com.addthis.hydra.job.spawn.Spawn; import com.addthis.maljson.JSONObject; import com.addthis.meshy.MeshyClient; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Throwables; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.annotation.JsonIgnoreProperties; import com.fasterxml.jackson.annotation.JsonProperty; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * A job alert monitors for specific conditions in the state * of one or more hydra jobs. When the condition is met the * job alert sends an email to the specified recipients and/or to a webhook. * Example conditions are: a job has errored, a job has kicked, * the output of a job consists at least X files or bytes, * the output of a job follows a specified format, etc. * * @user-reference * @hydra-category Job Alerts * @hydra-doc-position 11 * @exclude-fields activeJobs, activeTriggerTimes, lastAlertTime */ @Pluggable("job alerts") @JsonIgnoreProperties({"alertStatus", "canaryOutputMessage"}) public abstract class AbstractJobAlert implements Codable { private static final Logger log = LoggerFactory.getLogger(AbstractJobAlert.class); /** * This value will be interpreted as "scan through all the jobs in the cluster". * It must appear at the only value in the {@code jobIds} field. */ private static final String WILDCARD_JOB_STRING = "*"; /** * How many jobs should be scanned in one iteration of a wildcard job string. */ private static final int WILDCARD_BATCH_SIZE = Parameter.intValue("spawn.alert.batchSize", 50); /** Trigger alert if number of consecutive canary check exception is >= this limit */ @VisibleForTesting protected static final int MAX_CONSECUTIVE_CANARY_EXCEPTION = 10; @Nonnull @JsonProperty public final String alertId; /** * Human-readable description of the alert. */ @JsonProperty public final String description; /** * Optionally specify the number of minutes for the alert * to be continuously firing before sending an email or webhook. Can * be used to suppress intermittent alerts. */ @JsonProperty public final long delay; /** * List of email recipients. */ @JsonProperty public final String email; /** * Webhook URL to receive alert event */ @JsonProperty public final String webhookURL; /** * List of job identifiers. */ @JsonProperty public final ImmutableList<String> jobIds; /** * If true then rebroadcast the previous error message when * an error is detected at the current iteration. Default is false. * This is a trade-off where the most recent message for an alert * is not emailed with the advantage of no continuous spam of emails. */ @Nonnull @JsonProperty public final SuppressChanges suppressChanges; /** * If true, this alert was automatically generated by hydra * as part of the "Easy hydra alerts" config on a job */ @JsonProperty public final AutoGenerated autoGenerated; /* Map storing {job id : error description} for all alerted jobs the last time this alert was checked */ @JsonProperty protected volatile ImmutableMap<String, String> activeJobs; /* Map storing {job id : trigger time} for all triggering jobs the last time this alert was checked */ @JsonProperty protected volatile ImmutableMap<String, Long> activeTriggerTimes; // does not distinguish between multiple jobs, and racey wrt activeJobs, but only used for web-ui code for humans @JsonProperty protected volatile long lastAlertTime; /** Running count of consecutive canary query exceptions. Reset on success. */ protected final transient AtomicInteger consecutiveCanaryExceptionCount; private transient Iterator<Job> streamingIterator; private static <K, V> ImmutableMap<K, V> immutableOrEmpty(Map<K, V> input) { if (input == null) { return ImmutableMap.of(); } else { return ImmutableMap.copyOf(input); } } protected AbstractJobAlert(@Nullable String alertId, String description, @Time(TimeUnit.MINUTES) long delay, String email, String webhookURL, List<String> jobIds, SuppressChanges suppressChanges, AutoGenerated autoGenerated, long lastAlertTime, Map<String, Long> activeTriggerTimes, Map<String, String> activeJobs) { if (alertId == null) { String newAlertId = UUID.randomUUID().toString(); log.debug("creating new alert with uuid: {}", newAlertId); this.alertId = newAlertId; } else { this.alertId = alertId; } this.description = description; this.delay = delay; this.email = email; this.webhookURL = webhookURL; this.jobIds = ImmutableList.copyOf(jobIds); this.suppressChanges = suppressChanges; this.autoGenerated = autoGenerated; this.activeJobs = immutableOrEmpty(activeJobs); this.activeTriggerTimes = immutableOrEmpty(activeTriggerTimes); this.lastAlertTime = lastAlertTime; this.streamingIterator = null; this.consecutiveCanaryExceptionCount = new AtomicInteger(0); } // getters/setters that trigger ser/deser and are not vanilla (also have in-code usages) public Map<String, String> getActiveJobs() { return activeJobs; } /** Load state from an existing alert. The provided source alert should not be concurrently modified. */ public void setStateFrom(AbstractJobAlert sourceAlert) { this.lastAlertTime = sourceAlert.lastAlertTime; this.activeJobs = sourceAlert.activeJobs; this.activeTriggerTimes = sourceAlert.activeTriggerTimes; } // used by the ui/ web code @Deprecated public JSONObject toJSON() throws Exception { return CodecJSON.encodeJSON(this); } public ImmutableMap<String, String> checkAlertForJobs(Spawn spawn, MeshyClient meshyClient) { Set<Job> jobs = getAlertJobs(spawn); long now = System.currentTimeMillis(); long delayMillis = TimeUnit.MINUTES.toMillis(delay); ImmutableMap.Builder<String, String> newActiveJobsBuilder = new ImmutableMap.Builder<>(); ImmutableMap.Builder<String, Long> newActiveTriggerTimesBuilder = new ImmutableMap.Builder<>(); for (Job job : jobs) { long triggerTime = activeTriggerTimes.getOrDefault(job.getId(), now); String previousErrorMessage = activeJobs.get(job.getId()); // only interesting for certain edge cases String errorMessage = alertActiveForJob(meshyClient, job, previousErrorMessage); if (errorMessage != null) { newActiveTriggerTimesBuilder.put(job.getId(), triggerTime); if ((now - triggerTime) >= delayMillis) { newActiveJobsBuilder.put(job.getId(), errorMessage); } } } this.activeTriggerTimes = newActiveTriggerTimesBuilder.build(); this.activeJobs = newActiveJobsBuilder.build(); if (activeTriggerTimes.isEmpty()) { lastAlertTime = 0; } else if (lastAlertTime <= 0) { lastAlertTime = System.currentTimeMillis(); } return activeJobs; } @JsonIgnore public abstract String getTypeString(); @VisibleForTesting @Nullable final String alertActiveForJob(@Nullable MeshyClient meshClient, Job job, String previousErrorMessage) { String validationError = isValid(); if (validationError != null) { return validationError; } return testAlertActiveForJob(meshClient, job, previousErrorMessage); } @VisibleForTesting @Nullable protected abstract String testAlertActiveForJob(@Nullable MeshyClient meshClient, Job job, String previousErrorMessage); @VisibleForTesting @Nullable protected String handleCanaryException(Exception ex, @Nullable String previousErrorMessage) { log.warn("Exception during canary check for alert {} : ", alertId, ex); // special handling for transient exceptions due to query system down or busy Throwable rootCause = Throwables.getRootCause(ex); if ((rootCause instanceof SocketTimeoutException) || (rootCause instanceof ConnectException)) { int c = consecutiveCanaryExceptionCount.incrementAndGet(); if (c >= MAX_CONSECUTIVE_CANARY_EXCEPTION) { consecutiveCanaryExceptionCount.set(0); return "Canary check threw exception at least " + MAX_CONSECUTIVE_CANARY_EXCEPTION + " times in a row. " + "The most recent error is: " + rootCause; } else { return previousErrorMessage; } } return rootCause.toString(); } /** * Returns either a message indicating an error with the configuration * or null if the configuration is valid. * * @return null if configuration is valid. */ @JsonIgnore public abstract String isValid(); @Nonnull private Set<Job> getAlertJobs(Spawn spawn) { if (jobIds != null) { if (jobIds.size() == 1 && jobIds.get(0).equals(WILDCARD_JOB_STRING)) { return streamingJobSet(spawn); } else { return discreteJobSet(spawn); } } else { return ImmutableSet.of(); } } @Nonnull private Set<Job> discreteJobSet(Spawn spawn) { Set<Job> rv = new HashSet<>(); Map<String, List<String>> aliases = spawn.getAliasManager().getAliases(); for (String lookupId : jobIds) { Job job = spawn.getJob(lookupId); if (job != null) { rv.add(job); } else if (aliases.containsKey(lookupId)) { for (String jobId : aliases.get(lookupId)) { job = spawn.getJob(jobId); if (job != null) { rv.add(job); } } } } return rv; } @Nonnull private Set<Job> streamingJobSet(Spawn spawn) { Set<Job> rv = new HashSet<>(); if (streamingIterator == null) { streamingIterator = spawn.getSpawnState().jobsIterator(); } while (rv.size() < WILDCARD_BATCH_SIZE) { if (streamingIterator.hasNext()) { rv.add(streamingIterator.next()); } else { streamingIterator = null; break; } } for (String lookupId : activeJobs.keySet()) { Job job = spawn.getJob(lookupId); if (job != null) { rv.add(job); } } return rv; } @Override public String toString() { try { return CodecJSON.encodeString(this); } catch (Exception ignored) { return super.toString(); } } }