/* * RHQ Management Platform * Copyright (C) 2005-2008 Red Hat, Inc. * All rights reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation version 2 of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ package org.rhq.enterprise.server.operation; import java.util.ArrayList; import java.util.List; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.quartz.JobDetail; import org.quartz.JobExecutionContext; import org.quartz.JobExecutionException; import org.rhq.core.domain.auth.Subject; import org.rhq.core.domain.configuration.Configuration; import org.rhq.core.domain.operation.GroupOperationHistory; import org.rhq.core.domain.operation.OperationDefinition; import org.rhq.core.domain.operation.OperationHistory; import org.rhq.core.domain.operation.OperationRequestStatus; import org.rhq.core.domain.operation.ResourceOperationHistory; import org.rhq.core.domain.operation.bean.GroupOperationSchedule; import org.rhq.core.domain.operation.bean.ResourceOperationSchedule; import org.rhq.core.domain.resource.Resource; import org.rhq.core.domain.resource.group.ResourceGroup; import org.rhq.core.domain.util.PageControl; import org.rhq.core.util.exception.ThrowableUtil; import org.rhq.enterprise.server.auth.SubjectManagerLocal; import org.rhq.enterprise.server.resource.ResourceManagerLocal; import org.rhq.enterprise.server.util.LookupUtil; /** * A job that invokes an operation on all resources that are members of a group. * * @author John Mazzitelli */ public class GroupOperationJob extends OperationJob { private static final Log log = LogFactory.getLog(GroupOperationJob.class); public static final String DATAMAP_INT_GROUP_ID = "groupId"; public static final String DATAMAP_INT_ARRAY_EXECUTION_ORDER = "executionOrder"; // comma-separated list of IDs public static final String DATAMAP_BOOL_HALT_ON_FAILURE = "haltOnFailure"; public static final int BREAK_VALUE = 1000 * 60 * 60 * 24; class ResourceOperationDetailsComposite { Resource resource; ResourceOperationHistory history; ResourceOperationSchedule schedule; public ResourceOperationDetailsComposite(Resource resource, ResourceOperationHistory history, ResourceOperationSchedule schedule) { this.resource = resource; this.history = history; this.schedule = schedule; } } /** * Prefix for all job names and job groups names of group operations. */ private static final String GROUP_JOB_NAME_PREFIX = "rhq-group-"; public static String createUniqueJobName(ResourceGroup group, String operationName) { return GROUP_JOB_NAME_PREFIX + group.getId() + "-" + operationName.hashCode() + "-" + System.currentTimeMillis(); } public static String createJobGroupName(ResourceGroup group) { return GROUP_JOB_NAME_PREFIX + group.getId(); } public void execute(JobExecutionContext context) throws JobExecutionException { GroupOperationSchedule schedule = null; GroupOperationHistory groupHistory; Subject user = null; try { JobDetail jobDetail = context.getJobDetail(); OperationManagerLocal operationManager = LookupUtil.getOperationManager(); updateOperationScheduleEntity(jobDetail, context.getNextFireTime(), operationManager); // we only got here because the user was allowed to execute / schedule the operation in the first place, // thus it's safe to pass in the overlord here schedule = operationManager.getGroupOperationSchedule(LookupUtil.getSubjectManager().getOverlord(), jobDetail); if (schedule == null) { throw new CancelJobException("Resource Schedule no longer exists, canceling job"); } // create a new session even if user is logged in elsewhere, we don't want to attach to that user's session user = getUserWithSession(schedule.getSubject(), false); ResourceGroup group = schedule.getGroup(); // we need the operation definition to fill in the history item OperationDefinition op; op = operationManager.getSupportedGroupOperation(user, group.getId(), schedule.getOperationName(), false); // first we need to create an INPROGRESS *group* history item Configuration parameters = schedule.getParameters(); if (parameters != null) { parameters = parameters.deepCopy(false); // we need a copy to avoid constraint violations upon delete } groupHistory = new GroupOperationHistory(jobDetail.getName(), jobDetail.getGroup(), user.getName(), op, parameters, group); groupHistory = (GroupOperationHistory) operationManager.updateOperationHistory(user, groupHistory); // get the resources to operate on, ordered or not List<Resource> resourcesToOperateOn; if (schedule.getExecutionOrder() != null) { resourcesToOperateOn = schedule.getExecutionOrder(); } else { ResourceManagerLocal resourceManager = LookupUtil.getResourceManager(); PageControl pageControl = PageControl.getUnlimitedInstance(); resourcesToOperateOn = resourceManager.findExplicitResourcesByResourceGroup(user, group, pageControl); } // now create detail composites from the resource list List<ResourceOperationDetailsComposite> resourceComposites = new ArrayList<ResourceOperationDetailsComposite>(); getUserWithSession(user, true); // refresh our session to reset the timeout clock for (Resource nextResourceToOperateOn : resourcesToOperateOn) { // create the non-quartz schedule entity for the given job execution context data ResourceOperationSchedule resourceSchedule = createScheduleForResource(schedule, jobDetail.getGroup(), user, nextResourceToOperateOn); // create the resource-level history entity for the newly created non-quartz schedule entity // this method also does the persisting ResourceOperationHistory resourceHistory = createOperationHistory(resourceSchedule.getJobName(), resourceSchedule.getJobGroup(), resourceSchedule, groupHistory, operationManager); // add all three elements to the composite, which will be iterated over below for the bulk of the work resourceComposites.add(new ResourceOperationDetailsComposite(nextResourceToOperateOn, resourceHistory, resourceSchedule)); } // now tell the agents to invoke the operation for all resources if (schedule.getExecutionOrder() != null) { boolean hadFailure = false; // synchronously execute, waiting for each operation to finish before going to the next for (ResourceOperationDetailsComposite composite : resourceComposites) { try { if (hadFailure) { // there was a failure during execution of this group operation; // thus, mark all remaining operation histories as cancelled composite.history.setErrorMessage("This has been cancelled due to halt-on-error " + "being set on the parent group operation schedule. " + "A previous resource operation that executed prior " + "to this resource operation failed, thus causing " + "this resource operation to be cancelled."); composite.history.setStatus(OperationRequestStatus.CANCELED); composite.history = (ResourceOperationHistory) operationManager.updateOperationHistory( getUserWithSession(user, true), composite.history); continue; } invokeOperationOnResource(composite, operationManager); int resourceHistoryId = composite.history.getId(); OperationHistory updatedOperationHistory = null; long sleep = 1000L; // quick sleep for fast ops, then slow down long maxSleep = 5000L; do { Thread.sleep(sleep); sleep = (sleep == maxSleep) ? sleep : sleep + 1000L; // it's unlikely but possible that a client program could actually query for, process, and // delete the history before this code gets a chance to run. If the record is gone just // assume things were handled externally. try { updatedOperationHistory = operationManager.getOperationHistoryByHistoryId( getUserWithSession(user, true), resourceHistoryId); } catch (IllegalArgumentException e) { if (log.isDebugEnabled()) { log.debug("Failed to find operation history", e); } break; } // if the duration was ridiculously long, let's break out of here. this will rarely // be triggered because our operation manager will timeout long running operations for us // (based on the operation's define timeout). But, me being paranoid, I want to be able // to break this infinite loop if for some reason the operation manager isn't doing its job. // if the operation took longer than 24 hours, this breaks the loop. if (updatedOperationHistory.getDuration() > (GroupOperationJob.BREAK_VALUE)) { break; } } while (updatedOperationHistory.getStatus() == OperationRequestStatus.INPROGRESS); // halt the rest if we got a failure and were told not to go on if (null != updatedOperationHistory && (updatedOperationHistory.getStatus() != OperationRequestStatus.SUCCESS) && schedule.isHaltOnFailure()) { hadFailure = true; } } catch (Exception e) { // failed to even send to the agent, immediately mark the job as failed groupHistory.setErrorMessage(ThrowableUtil.getStackAsString(e)); groupHistory = (GroupOperationHistory) operationManager.updateOperationHistory( getUserWithSession(user, true), groupHistory); if (schedule.isHaltOnFailure()) { hadFailure = true; } } } } else { // send the invocation requests without waiting for each to return for (ResourceOperationDetailsComposite composite : resourceComposites) { try { invokeOperationOnResource(composite, operationManager); } catch (Exception e) { if (e instanceof CancelJobException) { throw e; } // failed to even send to the agent, immediately mark the job as failed groupHistory.setErrorMessage(ThrowableUtil.getStackAsString(e)); groupHistory = (GroupOperationHistory) operationManager.updateOperationHistory( getUserWithSession(user, true), groupHistory); // Note: in actuality - I don't think users have a way in the user interface to turn on halt-on-failure for parallel execution. // So this isHaltOnFailure will probably always be false. But in case we want to support this, leave this here. // What will happen is we will stop sending requests to the agents to invoke more resource operations. Any previous // resource operations invoked, however, will still be running and allowed to finish on their respective agents. if (schedule.isHaltOnFailure()) { throw e; } } } } } catch (Exception e) { if (e instanceof CancelJobException) { throw (CancelJobException) e; } String error = "Failed to execute scheduled operation [" + schedule + "]"; LogFactory.getLog(GroupOperationJob.class).error(error, e); throw new JobExecutionException(error, e, false); } finally { // clean up our temporary session by logging out of it try { if (user != null) { SubjectManagerLocal subjectMgr = LookupUtil.getSubjectManager(); subjectMgr.logout(user); } } catch (Exception e) { log.debug("Failed to log out of temporary group operation session - will be cleaned up during session purge later: " + ThrowableUtil.getAllMessages(e)); } } } private ResourceOperationSchedule createScheduleForResource(GroupOperationSchedule schedule, String jobGroup, Subject user, Resource resource) throws Exception { ResourceOperationSchedule resourceSchedule; // We need to provide a unique job name for the group member. // The job name will be unique but it will have a job group name of the group job. // NOTE! This job-name/job-group combination will NOT have a Quartz scheduled job in the Quartz tables. // This is an invocation that JON spawns when Quartz triggers the group job. Quartz does not trigger // these group member resource jobs. String resourceJobName = ResourceOperationJob.createUniqueJobName(resource, schedule.getOperationName()); resourceSchedule = new ResourceOperationSchedule(); resourceSchedule.setJobName(resourceJobName); resourceSchedule.setJobGroup(jobGroup); resourceSchedule.setDescription(schedule.getDescription()); resourceSchedule.setOperationName(schedule.getOperationName()); resourceSchedule.setParameters(schedule.getParameters()); resourceSchedule.setSubject(user); resourceSchedule.setResource(resource); return resourceSchedule; } private void invokeOperationOnResource(ResourceOperationDetailsComposite composite, OperationManagerLocal operationManager) throws Exception { new ResourceOperationJob().invokeOperationOnResource(composite.schedule, composite.history, operationManager); } }