/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.addthis.hydra.job.alert;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;

import java.io.IOException;

import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.stream.Collectors;

import java.text.DecimalFormat;
import java.text.SimpleDateFormat;

import com.addthis.basis.net.HttpUtil;
import com.addthis.basis.net.http.HttpResponse;
import com.addthis.codec.jackson.Jackson;
import com.addthis.codec.json.CodecJSON;
import com.addthis.hydra.job.Job;
import com.addthis.hydra.job.JobState;
import com.addthis.hydra.job.JobTask;
import com.addthis.hydra.job.JobTaskState;
import com.addthis.hydra.job.spawn.Spawn;
import com.addthis.hydra.job.spawn.SpawnMesh;
import com.addthis.hydra.job.store.SpawnDataStore;
import com.addthis.hydra.util.EmailUtil;
import com.addthis.maljson.JSONArray;
import com.addthis.maljson.JSONObject;
import com.addthis.meshy.MeshyClient;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Lists;
import com.google.common.collect.MapDifference;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimaps;
import com.google.common.collect.SetMultimap;

import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.typesafe.config.ConfigFactory;

import org.apache.commons.lang3.StringUtils;
import org.apache.http.entity.ContentType;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static com.addthis.hydra.job.store.SpawnDataStoreKeys.SPAWN_COMMON_ALERT_PATH;

/**
 * This class runs over the set of job alerts, sending trigger/clear emails as appropriate.
 *
 * <p>Alerts are kept in an in-memory map keyed by alert id and are persisted through the
 * {@link SpawnDataStore} whenever they are added, changed or removed.
 */
public class JobAlertRunner {

    private static final Logger log = LoggerFactory.getLogger(JobAlertRunner.class);

    private static final String clusterHead =
            ConfigFactory.load().getString("com.addthis.hydra.job.spawn.Spawn.httpHost");
    private static final ObjectMapper objectMapper = new ObjectMapper();
    private static final String meshHost = SpawnMesh.getMeshHost();
    private static final int meshPort = SpawnMesh.getMeshPort();

    /** Number of bytes in one gigabyte (1024^3), computed with exact long arithmetic. */
    private static final long GIGA_BYTE = 1024L * 1024L * 1024L;

    // SimpleDateFormat and DecimalFormat are not thread-safe; all access goes through the
    // format(...) helpers below, which synchronize on the formatter instance.
    private static final SimpleDateFormat dateFormat = new SimpleDateFormat("yyMMdd-HHmm");
    private static final DecimalFormat decimalFormat = new DecimalFormat("#.###");

    private final Spawn spawn;
    private final SpawnDataStore spawnDataStore;
    private final ConcurrentHashMap<String, AbstractJobAlert> alertMap;

    /**
     * A mapping from (jobIds + aliases) to a set of alertIds.
     * Does not dereference aliases into their corresponding jobIds.
     */
    private final SetMultimap<String, String> jobToAlertsMap =
            Multimaps.synchronizedSetMultimap(HashMultimap.create());

    /** Mesh client handed to each alert check; null when it could not be created. */
    private MeshyClient meshyClient;

    // volatile (like lastAlertScanFailed) so that a toggle made through enableAlerts() /
    // disableAlerts() is visible to whichever thread runs scanAlerts().
    private volatile boolean alertsEnabled;
    private volatile boolean lastAlertScanFailed;

    /**
     * Creates the runner, reads the persisted enabled flag and loads all stored alerts.
     *
     * @param spawn the Spawn instance used to look up jobs and to obtain the data store
     */
    public JobAlertRunner(Spawn spawn) {
        this.spawn = spawn;
        this.spawnDataStore = spawn.getSpawnDataStore();
        try {
            meshyClient = new MeshyClient(meshHost, meshPort);
        } catch (IOException e) {
            log.warn("Warning: failed to instantiate job alert mesh client", e);
            meshyClient = null;
        }
        String alertsEnabledString = null;
        try {
            alertsEnabledString = spawnDataStore.get(SPAWN_COMMON_ALERT_PATH);
        } catch (Exception e) {
            // Keep the original message but also hand the exception to the logger so the
            // stack trace is not lost.
            log.warn("Unable to read alerts status due to: {}", e.getMessage(), e);
        }
        // A missing or empty value means "enabled"; only an explicit non-"true" value disables.
        this.alertsEnabled = (alertsEnabledString == null)
                             || alertsEnabledString.isEmpty()
                             || "true".equals(alertsEnabledString);
        this.alertMap = new ConcurrentHashMap<>();
        loadAlertMap();
    }

    /** Disables alert scanning */
    public void disableAlerts() throws Exception {
        spawnDataStore.put(SPAWN_COMMON_ALERT_PATH, "false");
        this.alertsEnabled = false;
    }

    /** Enables alert scanning */
    public void enableAlerts() throws Exception {
        spawnDataStore.put(SPAWN_COMMON_ALERT_PATH, "true");
        this.alertsEnabled = true;
    }

    /** @return true if alert scanning is currently enabled */
    public boolean isAlertsEnabled() {
        return alertsEnabled;
    }

    /** @return true if the most recent call to {@link #scanAlerts()} ended with an exception */
    public boolean isLastAlertScanFailed() {
        return lastAlertScanFailed;
    }

    /**
     * Iterate over alert map, checking the status of each alert and sending emails as needed.
     *
     * <p>Does nothing when alerts are disabled. Any exception aborts the current scan, is logged,
     * and is reported through {@link #isLastAlertScanFailed()}.
     */
    public void scanAlerts() {
        if (!alertsEnabled) {
            return;
        }
        log.info("Started alert scan of {} alerts...", alertMap.size());
        try {
            for (Map.Entry<String, AbstractJobAlert> entry : alertMap.entrySet()) {
                AbstractJobAlert oldAlert = entry.getValue();
                Map<String, String> currentErrors = oldAlert.getActiveJobs();
                // entry may be concurrently deleted, so only recompute if still present, and while locked
                AbstractJobAlert alert = alertMap.computeIfPresent(entry.getKey(), (id, currentAlert) -> {
                    currentAlert.checkAlertForJobs(spawn, meshyClient);
                    if (!currentAlert.getActiveJobs().equals(currentErrors)) {
                        storeAlert(currentAlert.alertId, currentAlert);
                    }
                    return currentAlert;
                });
                // null if it was concurrently removed from the map. Does not catch all removals, but might as well
                // make a best effort attempt to send clears when convenient (should probably move clear emails to
                // the removal method at some point)
                if (alert == null) {
                    sendAlert(oldAlert, "[CLEAR] ", currentErrors);
                } else {
                    Map<String, String> newErrors = alert.getActiveJobs();
                    MapDifference<String, String> difference = Maps.difference(currentErrors, newErrors);
                    sendAlert(oldAlert, "[CLEAR] ", difference.entriesOnlyOnLeft());
                    sendAlert(alert, "[TRIGGER] ", difference.entriesOnlyOnRight());
                    // Jobs that were and still are in error, but whose message changed.
                    Map<String, String> errorsChanged = new HashMap<>();
                    for (Map.Entry<String, MapDifference.ValueDifference<String>> differing :
                            difference.entriesDiffering().entrySet()) {
                        String oldValue = differing.getValue().leftValue();
                        String newValue = differing.getValue().rightValue();
                        if (!alert.suppressChanges.suppress(oldValue, newValue)) {
                            errorsChanged.put(differing.getKey(), newValue);
                        }
                    }
                    sendAlert(alert, "[ERROR CHANGED] ", errorsChanged);
                }
            }
            lastAlertScanFailed = false;
            log.info("Finished alert scan");
        } catch (Exception e) {
            lastAlertScanFailed = true;
            log.error("Unexpected error while scanning alerts: {}", e.getMessage(), e);
        }
    }

    /**
     * Builds the plain-text job summary that is embedded in alert emails.
     *
     * @param job the job to summarize; may be null
     * @return the summary text, or an empty string if {@code job} is null
     */
    private static String emailSummary(Job job) {
        if (job == null) {
            return "";
        }
        long files = 0;
        double bytes = 0;
        int running = 0;
        int errored = 0;
        int done = 0;
        int numNodes = 0;
        List<JobTask> jobNodes = job.getCopyOfTasks();
        if (jobNodes != null) {
            numNodes = jobNodes.size();
            for (JobTask task : jobNodes) {
                files += task.getFileCount();
                bytes += task.getByteCount();
                // NOTE(review): every non-IDLE task is counted as running, which includes tasks
                // in ERROR (also counted as done/errored below). Preserved as-is; confirm intent.
                if (!task.getState().equals(JobTaskState.IDLE)) {
                    running++;
                }
                switch (task.getState()) {
                    case IDLE:
                        done++;
                        break;
                    case ERROR:
                        done++;
                        errored++;
                        break;
                    default:
                        break;
                }
            }
        }
        StringBuilder sb = new StringBuilder();
        sb.append("Cluster : ").append(clusterHead).append("\n");
        sb.append("Job : ").append(job.getId()).append("\n");
        sb.append("Job Link : http://").append(clusterHead).append(":5052/spawn2/index.html#jobs/")
          .append(job.getId()).append("/tasks\n");
        sb.append("Description : ").append(job.getDescription()).append("\n");
        sb.append("------------------------------ \n");
        sb.append("Task Summary \n");
        sb.append("------------------------------ \n");
        sb.append("Job State : ").append(job.getState()).append("\n");
        sb.append("Start Time : ").append(format(job.getStartTime())).append("\n");
        sb.append("End Time : ").append(format(job.getEndTime())).append("\n");
        sb.append("Num Nodes : ").append(numNodes).append("\n");
        sb.append("Running Nodes : ").append(running).append("\n");
        sb.append("Errored Nodes : ").append(errored).append("\n");
        sb.append("Done Nodes : ").append(done).append("\n");
        sb.append("Task files : ").append(files).append("\n");
        sb.append("Task Bytes : ").append(format(bytes)).append(" GB\n");
        sb.append("------------------------------ \n");
        return sb.toString();
    }

    /**
     * Formats a byte count as gigabytes using the pattern {@code #.###}.
     *
     * @param bytes number of bytes
     * @return the value in gigabytes, formatted
     */
    private static String format(double bytes) {
        double gb = bytes / GIGA_BYTE;
        synchronized (decimalFormat) {
            return decimalFormat.format(gb);
        }
    }

    /**
     * Formats an epoch-millisecond timestamp using the pattern {@code yyMMdd-HHmm}.
     *
     * @param time timestamp in milliseconds, or null
     * @return the formatted time, or {@code "-"} if {@code time} is null
     */
    private static String format(Long time) {
        if (time == null) {
            return "-";
        }
        synchronized (dateFormat) {
            return dateFormat.format(new Date(time));
        }
    }

    /**
     * Dispatches a notification for the given alert to its configured email address and/or
     * webhook. Does nothing when {@code errors} is empty.
     *
     * @param jobAlert the alert that triggered, cleared or changed
     * @param reason   prefix describing the event, e.g. {@code "[TRIGGER] "}
     * @param errors   map from job id to error message for the affected jobs
     */
    private void sendAlert(AbstractJobAlert jobAlert, String reason, Map<String, String> errors) {
        if (errors.isEmpty()) {
            return;
        }
        String alertLink = String.format("http://%s:5052/spawn2/index.html#alerts/%s", clusterHead, jobAlert.alertId);
        log.info("Alerting {} :: jobs : {} : {}", jobAlert.alertId, errors.keySet(), reason);
        if (StringUtils.isNotBlank(jobAlert.email)) {
            sendEmailAlert(jobAlert, alertLink, reason, errors);
        }
        if (StringUtils.isNotBlank(jobAlert.webhookURL)) {
            sendWebhookAlert(jobAlert, alertLink, reason, errors);
        }
    }

    /**
     * Builds the webhook payload for an alert event.
     *
     * @param spawn     used to look up each job in error
     * @param jobAlert  the alert that triggered, cleared or changed
     * @param alertLink URL of the alert in the UI
     * @param reason    event prefix; surrounding whitespace is trimmed
     * @param errors    map from job id to error message
     * @return the populated request object, with one {@link JobError} per entry of {@code errors}
     */
    @VisibleForTesting
    static AlertWebhookRequest getWebhookObject(Spawn spawn,
                                                AbstractJobAlert jobAlert,
                                                String alertLink,
                                                String reason,
                                                Map<String, String> errors) {
        // Turn all the jobs in error into a list of information about each job
        AlertWebhookRequest webhookRequest = new AlertWebhookRequest();
        webhookRequest.setAlertType(jobAlert.getTypeString());
        webhookRequest.setAlertLink(alertLink);
        webhookRequest.setAlertReason(reason.trim());
        webhookRequest.setAlertDescription(jobAlert.description);
        errors.forEach((jobUUID, errMsg) -> {
            JobError jobError = new JobError();
            jobError.setId(jobUUID);
            jobError.setError(errMsg);
            jobError.setClusterHead(clusterHead);
            Job job = spawn.getJob(jobUUID);
            if (job != null) {
                jobError.setJobState(job.getState());
                jobError.setDescription(job.getDescription());
                if (job.getStartTime() != null) {
                    jobError.setStartTime(job.getStartTime());
                }
                if (job.getEndTime() != null) {
                    jobError.setEndTime(job.getEndTime());
                }
                List<JobTask> jobTasks = job.getCopyOfTasks();
                // Same null guard emailSummary applies to getCopyOfTasks(); counts stay 0 otherwise.
                if (jobTasks != null) {
                    jobError.setNodeCount(jobTasks.size());
                    jobError.setErrorCount(
                            (int) jobTasks.stream().filter(t -> t.getState() == JobTaskState.ERROR).count());
                }
            }
            webhookRequest.getJobsInError().add(jobError);
        });
        return webhookRequest;
    }

    /**
     * POSTs the JSON webhook payload for an alert event to the alert's webhook URL.
     * Failures are logged and not propagated.
     *
     * @param jobAlert  the alert whose {@code webhookURL} receives the request
     * @param alertLink URL of the alert in the UI
     * @param reason    event prefix
     * @param errors    map from job id to error message
     */
    private void sendWebhookAlert(AbstractJobAlert jobAlert,
                                  String alertLink,
                                  String reason,
                                  Map<String, String> errors) {
        try {
            byte[] body = objectMapper.writeValueAsBytes(getWebhookObject(spawn, jobAlert, alertLink, reason, errors));
            HttpResponse response = HttpUtil.httpPost(jobAlert.webhookURL,
                                                      ContentType.APPLICATION_JSON.getMimeType(),
                                                      body,
                                                      5_000);
            if (response.getStatus() >= 300) {
                log.error("non-200 status code received for webhook alert for alert {}", jobAlert.alertId);
            }
        } catch (IOException ex) {
            log.error("unable to send webhook alert for alert {}", jobAlert.alertId, ex);
        }
    }

    /**
     * Send an email when an alert fires, clears or changes.
     *
     * @param jobAlert  the alert being reported; supplies recipient, type and description
     * @param alertLink URL of the alert in the UI
     * @param reason    event prefix used in the subject and first line of the body
     * @param errors    map from job id to error message; one summary section is written per entry
     */
    private void sendEmailAlert(AbstractJobAlert jobAlert,
                                String alertLink,
                                String reason,
                                Map<String, String> errors) {
        String description = jobAlert.description;
        boolean blankDescription = StringUtils.isBlank(description);
        // Subject uses the first line of the description, or the job ids when there is none.
        final String shortDescription;
        if (blankDescription) {
            shortDescription = errors.keySet().toString();
        } else {
            shortDescription = description.split("\n")[0];
        }
        String subject = String.format("%s %s - %s", reason, jobAlert.getTypeString(), shortDescription);
        StringBuilder sb = new StringBuilder(reason + ' ' + jobAlert.getTypeString() + '\n');
        sb.append("Alert link : ").append(alertLink).append('\n');
        if (!blankDescription) {
            sb.append("Alert Description : ").append(description).append('\n');
        }
        for (Map.Entry<String, String> entry : errors.entrySet()) {
            sb.append(emailSummary(spawn.getJob(entry.getKey()))).append('\n');
            sb.append("Error Message\n");
            sb.append(entry.getValue());
            sb.append("\n------------------------------\n");
        }
        if (!EmailUtil.email(jobAlert.email, subject, sb.toString())) {
            log.error("Unable to send email for alert {}", jobAlert.alertId);
        }
    }

    /** Loads every stored alert from the data store into {@link #alertMap}. */
    private void loadAlertMap() {
        Map<String, String> alertsRaw = spawnDataStore.getAllChildren(SPAWN_COMMON_ALERT_PATH);
        for (Map.Entry<String, String> entry : alertsRaw.entrySet()) {
            // Underscores are used to mark meta-information (for now, whether we have loaded legacy alerts.)
            if (!entry.getKey().startsWith("_")) {
                loadAlert(entry.getKey(), entry.getValue());
            }
        }
        log.info("{} alerts loaded", alertMap.size());
    }

    /**
     * Decodes one stored alert and registers it. Decode failures are logged and skipped.
     *
     * @param id  alert id
     * @param raw JSON-encoded alert
     */
    private void loadAlert(String id, String raw) {
        try {
            AbstractJobAlert jobAlert = CodecJSON.decodeString(AbstractJobAlert.class, raw);
            alertMap.put(id, jobAlert);
            updateJobToAlertsMap(id, null, jobAlert);
        } catch (Exception ex) {
            log.error("Failed to decode JobAlert id={} raw={}", id, raw, ex);
        }
    }

    /**
     * Remove any outdated mappings from a (job + alias) to an alert and
     * insert new mappings. If {@code old} is null then do not remove
     * any mappings. If {@code alert} is null then do not insert any mappings.
     *
     * @param id    alertId
     * @param old   if non-null then remove associations
     * @param alert if non-null then insert associations
     */
    private void updateJobToAlertsMap(@Nonnull String id,
                                      @Nullable AbstractJobAlert old,
                                      @Nullable AbstractJobAlert alert) {
        if (old != null) {
            for (String jobId : old.jobIds) {
                jobToAlertsMap.remove(jobId, id);
            }
        }
        if (alert != null) {
            for (String jobId : alert.jobIds) {
                jobToAlertsMap.put(jobId, id);
            }
        }
    }

    /**
     * Adds or replaces an alert and persists it. When an alert with the same id already exists,
     * its state is carried over to the new alert via {@code setStateFrom}.
     *
     * @param id    alert id
     * @param alert the new alert definition
     */
    public void putAlert(String id, AbstractJobAlert alert) {
        alertMap.compute(id, (key, old) -> {
            if (old != null) {
                alert.setStateFrom(old);
            }
            updateJobToAlertsMap(id, old, alert);
            storeAlert(id, alert);
            return alert;
        });
    }

    /**
     * Removes an alert from memory and from the data store. Does nothing for a null or unknown id.
     *
     * @param id alert id, may be null
     */
    public void removeAlert(String id) {
        if (id != null) {
            alertMap.computeIfPresent(id, (key, value) -> {
                updateJobToAlertsMap(id, value, null);
                storeAlert(id, null);
                return null;
            });
        }
    }

    /**
     * Persists an alert, or deletes its stored form when {@code alert} is null.
     * Failures are logged and not propagated.
     *
     * @param alertId alert id
     * @param alert   alert to store, or null to delete
     */
    private void storeAlert(String alertId, @Nullable AbstractJobAlert alert) {
        try {
            if (alert != null) {
                spawnDataStore.putAsChild(SPAWN_COMMON_ALERT_PATH, alertId, CodecJSON.encodeString(alert));
            } else {
                spawnDataStore.deleteChild(SPAWN_COMMON_ALERT_PATH, alertId);
            }
        } catch (Exception e) {
            // Trailing throwable argument makes the logger record the cause and stack trace.
            log.warn("Warning: failed to save alert id={} alert={}", alertId, alert, e);
        }
    }

    /**
     * Get a snapshot of the alert map as an array, mainly for rendering in the UI.
     *
     * @return A JSONArray holding the JSON representation of each existing alert; alerts that
     *         fail to serialize are logged and omitted
     */
    public JSONArray getAlertStateArray() {
        JSONArray rv = new JSONArray();
        for (AbstractJobAlert jobAlert : alertMap.values()) {
            try {
                rv.put(jobAlert.toJSON());
            } catch (Exception e) {
                log.warn("Warning: failed to send alert in array: {}", jobAlert, e);
            }
        }
        return rv;
    }

    /**
     * Get a snapshot of the alert map as a JSON object keyed by alert id.
     *
     * @return A JSONObject mapping alert id to the alert's JSON representation; alerts that
     *         fail to serialize are logged and omitted
     */
    public JSONObject getAlertStateMap() {
        JSONObject rv = new JSONObject();
        for (AbstractJobAlert jobAlert : alertMap.values()) {
            try {
                rv.put(jobAlert.alertId, jobAlert.toJSON());
            } catch (Exception e) {
                log.warn("Warning: failed to send alert in map: {}", jobAlert, e);
            }
        }
        return rv;
    }

    /**
     * Returns the JSON string for a single alert.
     *
     * @param alertId alert id
     * @return the alert as a JSON string, or null if it does not exist or cannot be serialized
     */
    public String getAlert(String alertId) {
        try {
            AbstractJobAlert alert = alertMap.get(alertId);
            if (alert == null) {
                return null;
            } else {
                return alert.toJSON().toString();
            }
        } catch (Exception e) {
            log.warn("Failed to fetch alert {}", alertId, e);
            return null;
        }
    }

    /**
     * Copy and then modify an alert by removing a specific job id.
     *
     * @param jobId job id to drop from the copy
     * @param old   alert to copy
     * @return the modified copy, or {@code old} itself if the copy could not be created
     */
    private AbstractJobAlert copyWithoutJobId(@Nonnull String jobId, AbstractJobAlert old) {
        ObjectNode json = Jackson.defaultMapper().valueToTree(old);
        // putArray replaces the existing "jobIds" field with a fresh, empty array.
        ArrayNode jsonArray = json.putArray("jobIds");
        old.jobIds.stream().filter(x -> !x.equals(jobId)).forEach(jsonArray::add);
        try {
            return Jackson.defaultMapper().treeToValue(json, AbstractJobAlert.class);
        } catch (IOException ex) {
            log.error("Internal error removing job alerts:", ex);
            return old;
        }
    }

    /**
     * Remove {@code jobId} from all alerts that are monitoring it. Delete alerts that are only monitoring this job.
     *
     * @param jobId the job id (or alias) to detach from its alerts
     */
    public void removeAlertsForJob(String jobId) {
        Set<String> alertIds = ImmutableSet.copyOf(jobToAlertsMap.get(jobId));
        for (String mappedAlertId : alertIds) {
            // computeIfPresent returns null both when the key is absent and when the remapping
            // function deletes the entry, so record explicitly whether the function ran. Without
            // this, deleting an alert that only monitored this job logged a bogus
            // "alert does not exist" warning.
            boolean[] alertExists = {false};
            alertMap.computeIfPresent(mappedAlertId, (alertId, alert) -> {
                alertExists[0] = true;
                ImmutableList<String> jobIds = alert.jobIds;
                if (!jobIds.contains(jobId)) {
                    log.warn("jobToAlertsMap has mapping from job {} to alert {} but alert has no reference to job",
                             jobId, alertId);
                    return alert;
                }
                @Nullable AbstractJobAlert newAlert;
                if (jobIds.size() == 1) {
                    // The alert only monitored this job: returning null removes it from the map.
                    newAlert = null;
                } else {
                    newAlert = copyWithoutJobId(jobId, alert);
                }
                updateJobToAlertsMap(alertId, alert, newAlert);
                storeAlert(alertId, newAlert);
                return newAlert;
            });
            if (!alertExists[0]) {
                log.warn("jobToAlertsMap has mapping from job {} to alert {} but alert does not exist",
                         jobId, mappedAlertId);
            }
        }
    }

    /**
     * Returns alerts for the given job id. Does not look up aliases for a job id. If job id is an alias, will
     * return any alerts that are configured on the alias, but will not look up alerts on the actual job id.
     *
     * @param jobId job id or alias
     * @return the alerts currently registered for {@code jobId}; never contains null
     */
    public Set<AbstractJobAlert> getAlertsForJob(String jobId) {
        Set<String> alertIds = ImmutableSet.copyOf(jobToAlertsMap.get(jobId));
        return alertIds.stream()
                       .map(alertMap::get)
                       // an id can be mapped here while its alert is (no longer) in alertMap
                       .filter(alert -> alert != null)
                       .collect(Collectors.toSet());
    }

    /** JSON payload sent to an alert's webhook URL. */
    @VisibleForTesting
    static class AlertWebhookRequest {

        private String alertDescription;
        private String alertLink;
        private String alertType;
        private String alertReason;
        private List<JobError> jobsInError = Lists.newArrayList();

        public AlertWebhookRequest() {
        }

        @JsonProperty("alert_description")
        public String getAlertDescription() {
            return alertDescription;
        }

        public void setAlertDescription(String alertDescription) {
            this.alertDescription = alertDescription;
        }

        @JsonProperty("alert_link")
        public String getAlertLink() {
            return alertLink;
        }

        public void setAlertLink(String alertLink) {
            this.alertLink = alertLink;
        }

        @JsonProperty("alert_type")
        public String getAlertType() {
            return alertType;
        }

        public void setAlertType(String alertType) {
            this.alertType = alertType;
        }

        @JsonProperty("alert_reason")
        public String getAlertReason() {
            return alertReason;
        }

        public void setAlertReason(String alertReason) {
            this.alertReason = alertReason;
        }

        @JsonProperty("jobs_in_error")
        public List<JobError> getJobsInError() {
            return jobsInError;
        }

        public void setJobsInError(List<JobError> jobsInError) {
            this.jobsInError = jobsInError;
        }
    }

    /** Per-job entry of {@link AlertWebhookRequest#getJobsInError()}. */
    @VisibleForTesting
    static class JobError {

        private String id;
        private String description;
        private String clusterHead;
        private String error;
        private JobState jobState;
        private long startTime;
        private long endTime;
        private int nodeCount;
        private int errorCount;

        public JobError() {
        }

        public String getId() {
            return id;
        }

        public void setId(String id) {
            this.id = id;
        }

        public String getDescription() {
            return description;
        }

        public void setDescription(String description) {
            this.description = description;
        }

        @JsonProperty("cluster_head")
        public String getClusterHead() {
            return clusterHead;
        }

        public void setClusterHead(String clusterHead) {
            this.clusterHead = clusterHead;
        }

        public String getError() {
            return error;
        }

        public void setError(String error) {
            this.error = error;
        }

        @JsonProperty("job_state")
        public JobState getJobState() {
            return jobState;
        }

        public void setJobState(JobState jobState) {
            this.jobState = jobState;
        }

        @JsonProperty("start_time")
        public long getStartTime() {
            return startTime;
        }

        public void setStartTime(long startTime) {
            this.startTime = startTime;
        }

        @JsonProperty("end_time")
        public long getEndTime() {
            return endTime;
        }

        public void setEndTime(long endTime) {
            this.endTime = endTime;
        }

        @JsonProperty("node_count")
        public int getNodeCount() {
            return nodeCount;
        }

        public void setNodeCount(int nodeCount) {
            this.nodeCount = nodeCount;
        }

        @JsonProperty("error_count")
        public int getErrorCount() {
            return errorCount;
        }

        public void setErrorCount(int errorCount) {
            this.errorCount = errorCount;
        }
    }
}