/* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.addthis.hydra.job.alert; import javax.annotation.Nonnull; import javax.annotation.Nullable; import java.util.Set; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; import java.util.function.Consumer; import com.addthis.basis.util.Parameter; import com.addthis.hydra.job.IJob; import com.addthis.hydra.job.alert.types.OnErrorJobAlert; import com.addthis.hydra.job.alert.types.RekickTimeoutJobAlert; import com.addthis.hydra.job.alert.types.RuntimeExceededJobAlert; import com.addthis.maljson.JSONArray; import com.addthis.maljson.JSONObject; import com.google.common.collect.ImmutableList; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import static com.addthis.hydra.job.alert.AutoGenerated.BASIC_ALERT; import static com.addthis.hydra.job.alert.AutoGenerated.BASIC_PAGE; public class JobAlertManagerImpl implements JobAlertManager { private static final Logger log = LoggerFactory.getLogger(JobAlertManagerImpl.class); private static final long ALERT_REPEAT_MILLIS = Parameter.longValue("spawn.job.alert.repeat", 5 * 60 * 1000); private static final long ALERT_DELAY_MILLIS = Parameter.longValue("spawn.job.alert.delay", 60 * 1000); private enum AlertAction {CREATE_OR_UPDATE, DELETE, NO_OP} private static final long MAX_RUNTIME = Parameter.longValue("alert.auto.runtime.max", 480); private static final long RUNTIME_PADDING = Parameter.longValue("alert.auto.runtime.padding", 30); private static final long REKICK_PADDING = Parameter.longValue("alert.auto.rekick.padding", 30); private static final String EXTRA_DESCRIPTION = "\nThis alert was automatically generated by Hydra. Use job settings (Basic Hydra Alerts/Pagerduty) to " + "manage. Do not modify directly."; private final JobAlertRunner jobAlertRunner; private final GroupManager groupManager; public JobAlertManagerImpl( GroupManager groupManager, JobAlertRunner jobAlertRunner, ScheduledExecutorService scheduledExecutorService) { this.jobAlertRunner = jobAlertRunner; this.groupManager = groupManager; this.scheduleAlertScan(scheduledExecutorService); } private void scheduleAlertScan(ScheduledExecutorService scheduledExecutorService) { if (scheduledExecutorService != null) { scheduledExecutorService.scheduleWithFixedDelay(jobAlertRunner::scanAlerts, ALERT_DELAY_MILLIS, ALERT_REPEAT_MILLIS, TimeUnit.MILLISECONDS); log.info("Alert scan scheduled: delay={}s, repeat={}s", ALERT_DELAY_MILLIS / 1000, ALERT_REPEAT_MILLIS / 1000); } else { log.warn("ScheduledExecutorService is not provided. Alert scan is disabled"); } } public void disableAlerts() throws Exception { this.jobAlertRunner.disableAlerts(); } public void enableAlerts() throws Exception { this.jobAlertRunner.enableAlerts(); } @Override public boolean isAlertEnabledAndWorking() { return jobAlertRunner.isAlertsEnabled() && !jobAlertRunner.isLastAlertScanFailed(); } public void putAlert(String alertId, AbstractJobAlert alert) { jobAlertRunner.putAlert(alertId, alert); } public void removeAlert(String alertId) { jobAlertRunner.removeAlert(alertId); } public void removeAlertsForJob(String jobId) { jobAlertRunner.removeAlertsForJob(jobId); } public JSONArray fetchAllAlertsArray() { return jobAlertRunner.getAlertStateArray(); } public JSONObject fetchAllAlertsMap() { return jobAlertRunner.getAlertStateMap(); } public String getAlert(String alertId) { return jobAlertRunner.getAlert(alertId); } /** * Create or update auto generated alerts on the job * * @param job the job being updated * @param basicAlerts true if we want basic alerts * @param basicPages true if we want basic pages */ @Override public void updateBasicAlerts(final IJob job, final boolean basicAlerts, final boolean basicPages) { @Nullable final Group group = this.groupManager.getGroup(job.getGroup()); // don't try to create alerts if config is not set up for the user/group if (group == null) { log.warn("No group '{}' found for job {}. Unable to create alerts without config.", job.getGroup(), job.getId()); return; } final Set<AbstractJobAlert> alerts = this.jobAlertRunner.getAlertsForJob(job.getId()); BasicAlerts existingBasicAlerts = BasicAlerts.create(alerts, BASIC_ALERT); BasicAlerts existingBasicPages = BasicAlerts.create(alerts, BASIC_PAGE); this.updateBasicAlert( existingBasicAlerts, job, basicAlerts, job.getBasicAlerts(), BASIC_ALERT, group.email, group.webhookURL, job::setBasicAlerts ); this.updateBasicAlert( existingBasicPages, job, basicPages, job.getBasicPages(), BASIC_PAGE, group.pagerEmail, null, job::setBasicPages ); } /** * Create or update one type of auto generated alerts * * @param existingAlerts any alerts that already exist * @param job the job being updated * @param alertsShouldExist true if alerts are wanted * @param alertsDoExist true if alerts were wanted before this update * @param autoGenerated alert or page * @param email for the alert * @param webhookURL for the alert * @param saveAlertFn function that updates the alert setting on the job */ @SuppressWarnings("MethodWithTooManyParameters") private void updateBasicAlert( BasicAlerts existingAlerts, IJob job, boolean alertsShouldExist, boolean alertsDoExist, @Nullable AutoGenerated autoGenerated, @Nullable String email, @Nullable String webhookURL, Consumer<Boolean> saveAlertFn) { // only create these alerts if there is a config if (email != null) { // actually save the job config part saveAlertFn.accept(alertsShouldExist); AlertAction action = JobAlertManagerImpl.determineAction(alertsShouldExist, alertsDoExist); // this will be null if there aren't existing alerts if (action == AlertAction.DELETE) { existingAlerts.forEach(alert -> this.removeAlert(alert.alertId)); } else if (action == AlertAction.CREATE_OR_UPDATE) { this.createAlerts(existingAlerts, autoGenerated, job, email, webhookURL); } } } private static AlertAction determineAction(boolean shouldExist, boolean exists) { if (shouldExist) { return AlertAction.CREATE_OR_UPDATE; } else if (exists) { return AlertAction.DELETE; } return AlertAction.NO_OP; } private void createAlerts( @Nonnull BasicAlerts existingAlerts, @Nullable AutoGenerated autoGenerated, @Nonnull IJob job, @Nullable String email, @Nullable String webhookURL) { String jobId = job.getId(); String description = job.getDescription() + EXTRA_DESCRIPTION; String errorId = (existingAlerts.getErrorAlert() == null) ? null : existingAlerts.getErrorAlert().alertId; String rekickId = (existingAlerts.getRekickAlert() == null) ? null : existingAlerts.getRekickAlert().alertId; String runtimeId = (existingAlerts.getRuntimeAlert() == null) ? null : existingAlerts.getRuntimeAlert().alertId; OnErrorJobAlert error = new OnErrorJobAlert(errorId, description, 0, email, webhookURL, ImmutableList.of(jobId), SuppressChanges.FALSE, autoGenerated, 0, null, null ); this.putAlert(error.alertId, error); Long rekick = job.getRekickTimeout(); if ((rekick != null) && (rekick > 0)) { long rekickTimeout = rekick + REKICK_PADDING; RekickTimeoutJobAlert rekickAlert = new RekickTimeoutJobAlert(rekickId, description, rekickTimeout, 0, email, webhookURL, ImmutableList.of(jobId), SuppressChanges.FALSE, autoGenerated, 0, null, null ); this.putAlert(rekickAlert.alertId, rekickAlert); } else if (rekickId != null) { this.removeAlert(rekickId); } Long runtime = job.getMaxRunTime(); if ((runtime != null) && (runtime > 0)) { long runtimeTimeout = JobAlertManagerImpl.calculateRuntimeTimeout(job.getTaskCount(), runtime, job.getMaxSimulRunning() ); RuntimeExceededJobAlert runtimeAlert = new RuntimeExceededJobAlert(runtimeId, description, runtimeTimeout, 0, email, webhookURL, ImmutableList.of(jobId), SuppressChanges.FALSE, autoGenerated, 0, null, null ); this.putAlert(runtimeAlert.alertId, runtimeAlert); } else if (runtimeId != null) { this.removeAlert(runtimeId); } } /** * Calculates the maximum time a job probably should run, then fudges it a bit. */ private static long calculateRuntimeTimeout(int tasks, long maxRuntime, int maxSimul) { double taskMultiplier = 1; if (maxSimul > 0) { taskMultiplier = Math.ceil((double) tasks / (double) maxSimul); } long timeout = (long) (((double) maxRuntime * taskMultiplier) + ((double) RUNTIME_PADDING * taskMultiplier)); if (timeout > MAX_RUNTIME) { return MAX_RUNTIME; } return timeout; } }