/*************************GO-LICENSE-START********************************* * Copyright 2014 ThoughtWorks, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. *************************GO-LICENSE-END***********************************/ package com.thoughtworks.go.server.service; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; import com.thoughtworks.go.domain.JobIdentifier; import com.thoughtworks.go.domain.JobInstance; import com.thoughtworks.go.server.domain.JobStatusListener; import com.thoughtworks.go.serverhealth.HealthStateScope; import com.thoughtworks.go.serverhealth.HealthStateType; import com.thoughtworks.go.serverhealth.ServerHealthService; import com.thoughtworks.go.serverhealth.ServerHealthState; import com.thoughtworks.go.util.SystemEnvironment; import com.thoughtworks.go.util.TimeProvider; import org.apache.log4j.Logger; import org.joda.time.DateTime; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Component; @Component public class ConsoleActivityMonitor { private static final Logger LOGGER = Logger.getLogger(ConsoleActivityMonitor.class); private final TimeProvider timeProvider; private final JobInstanceService jobInstanceService; private final ServerHealthService serverHealthService; private final GoConfigService goConfigService; private ConsoleService consoleService; private final ConcurrentMap<JobIdentifier, Long> jobLastActivityMap; private final long warningThreshold; @Autowired public ConsoleActivityMonitor(TimeProvider timeProvider, SystemEnvironment systemEnvironment, JobInstanceService jobInstanceService, ServerHealthService serverHealthService, GoConfigService goConfigService, ConsoleService consoleService) { this.timeProvider = timeProvider; this.jobInstanceService = jobInstanceService; this.serverHealthService = serverHealthService; this.goConfigService = goConfigService; this.consoleService = consoleService; this.jobLastActivityMap = new ConcurrentHashMap<>(); warningThreshold = systemEnvironment.getUnresponsiveJobWarningThreshold(); jobInstanceService.registerJobStateChangeListener(new ActiveJobListener(this)); } public void populateActivityMap() { long now = timeProvider.currentTimeMillis(); for (JobIdentifier jobIdentifier : jobInstanceService.allBuildingJobs()) { jobLastActivityMap.put(jobIdentifier, now); } LOGGER.info(String.format("Found '%s' building jobs. Added them with '%s' as the last heard time", jobLastActivityMap.size(), new DateTime(now))); } public void consoleUpdatedFor(JobIdentifier jobIdentifier) { long now = timeProvider.currentTimeMillis(); Long previously = jobLastActivityMap.replace(jobIdentifier, now); if (previously != null && now - previously > warningThreshold) { removeHungJobWarning(jobIdentifier); } } public void cancelUnresponsiveJobs(ScheduleService scheduleService) { long currentTime = timeProvider.currentTimeMillis(); for (Map.Entry<JobIdentifier, Long> jobTimeEntry : jobLastActivityMap.entrySet()) { long difference = currentTime - jobTimeEntry.getValue(); JobIdentifier jobIdentifier = jobTimeEntry.getKey(); if (shouldCancelHungJob(jobIdentifier, difference)) { scheduleService.cancelJob(jobIdentifier); try { consoleService.appendToConsoleLog(jobIdentifier, String.format("Go cancelled this job as it has not generated any console output for more than %s minute(s)", inMinutes(jobTerminationThreshold(jobIdentifier)))); } catch (Exception e) { LOGGER.error(String.format("Failed to update console log with reason for cancelling hung job '%s'", jobIdentifier.buildLocator()), e); } this.jobLastActivityMap.remove(jobIdentifier); removeHungJobWarning(jobIdentifier); LOGGER.info(String.format("Cancelled hung job '%s' as it was hung for more than '%s' minutes", jobIdentifier.buildLocator(), inMinutes(difference))); } else if (difference > warningThreshold) { LOGGER.info(String.format("Job '%s' has not updated console log for more than '%s' minutes", jobIdentifier.buildLocator(), inMinutes(difference))); removeHungJobWarning(jobIdentifier); addJobHungWarning(jobIdentifier, difference); } } } private void addJobHungWarning(JobIdentifier jobIdentifier, long difference) { String namespacedJob = String.format("%s/%s/%s", jobIdentifier.getPipelineName(), jobIdentifier.getStageName(), jobIdentifier.getBuildName()); serverHealthService.update(ServerHealthState.warningWithHtml( String.format("Job '%s' is not responding", namespacedJob), String.format("Job <a href='/go/tab/build/detail/%s'>%s</a> is currently running but has not shown any console activity in the last %s minute(s). This job may be hung.", jobIdentifier.buildLocator(), namespacedJob, inMinutes(difference)), HealthStateType.general(HealthStateScope.forJob(jobIdentifier.getPipelineName(), jobIdentifier.getStageName(), jobIdentifier.getBuildName())))); } private String inMinutes(long difference) { return String.valueOf(difference / 1000 / 60); } private void removeHungJobWarning(JobIdentifier jobIdentifier) { serverHealthService.removeByScope(HealthStateScope.forJob(jobIdentifier.getPipelineName(), jobIdentifier.getStageName(), jobIdentifier.getBuildName())); } private boolean shouldCancelHungJob(JobIdentifier jobIdentifier, long difference) { return goConfigService.canCancelJobIfHung(jobIdentifier) && difference > jobTerminationThreshold(jobIdentifier); } private long jobTerminationThreshold(JobIdentifier jobIdentifier) { return goConfigService.getUnresponsiveJobTerminationThreshold(jobIdentifier); } static final class ActiveJobListener implements JobStatusListener { private final ConsoleActivityMonitor consoleActivityMonitor; private ActiveJobListener(ConsoleActivityMonitor consoleActivityMonitor) { this.consoleActivityMonitor = consoleActivityMonitor; } public void jobStatusChanged(JobInstance job) { JobIdentifier identifier = job.getIdentifier(); if (job.getState().isBuilding()) { consoleActivityMonitor.jobLastActivityMap.putIfAbsent(identifier, consoleActivityMonitor.timeProvider.currentTimeMillis()); } else if (job.isCompleted() || job.isRescheduled()) { consoleActivityMonitor.jobLastActivityMap.remove(identifier); consoleActivityMonitor.removeHungJobWarning(identifier); } } } }