/* * JBoss, Home of Professional Open Source. * Copyright 2014, Red Hat, Inc., and individual contributors * as indicated by the @author tags. See the copyright.txt file in the * distribution for a full listing of individual contributors. * * This is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this software; if not, write to the Free * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA * 02110-1301 USA, or see the FSF site: http://www.fsf.org. */ package org.jboss.as.domain.management.controller; import static org.jboss.as.controller.descriptions.ModelDescriptionConstants.ACTIVE_OPERATION; import static org.jboss.as.controller.descriptions.ModelDescriptionConstants.CORE; import static org.jboss.as.controller.descriptions.ModelDescriptionConstants.DOMAIN_ROLLOUT; import static org.jboss.as.controller.descriptions.ModelDescriptionConstants.DOMAIN_UUID; import static org.jboss.as.controller.descriptions.ModelDescriptionConstants.EXCLUSIVE_RUNNING_TIME; import static org.jboss.as.controller.descriptions.ModelDescriptionConstants.EXECUTION_STATUS; import static org.jboss.as.controller.descriptions.ModelDescriptionConstants.MANAGEMENT_OPERATIONS; import static org.jboss.as.controller.descriptions.ModelDescriptionConstants.RUNNING_TIME; import java.util.ArrayList; import java.util.List; import java.util.Set; import java.util.TreeSet; import java.util.concurrent.TimeUnit; import org.jboss.as.controller.AttributeDefinition; import org.jboss.as.controller.OperationContext; import org.jboss.as.controller.OperationDefinition; import org.jboss.as.controller.OperationFailedException; import org.jboss.as.controller.OperationStepHandler; import org.jboss.as.controller.PathAddress; import org.jboss.as.controller.SimpleAttributeDefinitionBuilder; import org.jboss.as.controller.SimpleOperationDefinitionBuilder; import org.jboss.as.controller.client.helpers.MeasurementUnit; import org.jboss.as.controller.logging.ControllerLogger; import org.jboss.as.controller.operations.validation.IntRangeValidator; import org.jboss.as.controller.registry.OperationEntry; import org.jboss.as.controller.registry.Resource; import org.jboss.as.domain.management._private.DomainManagementResolver; import org.jboss.as.domain.management.logging.DomainManagementLogger; import org.jboss.dmr.ModelNode; import org.jboss.dmr.ModelType; /** * {@link org.jboss.as.controller.OperationStepHandler} that looks for and returns the id of single operation that is in * execution status {@link org.jboss.as.controller.OperationContext.ExecutionStatus#AWAITING_STABILITY} * and has been executing in that status for longer than a specified {@code timeout} seconds. * * @author Brian Stansberry (c) 2014 Red Hat Inc. */ public class FindNonProgressingOperationHandler implements OperationStepHandler { private static final AttributeDefinition STABILITY_TIMEOUT = SimpleAttributeDefinitionBuilder.create("timeout", ModelType.INT, false) .setDefaultValue(new ModelNode(15)) .setValidator(new IntRangeValidator(0, true)) .setMeasurementUnit(MeasurementUnit.SECONDS) .build(); static final OperationDefinition DEFINITION = new SimpleOperationDefinitionBuilder("find-non-progressing-operation", DomainManagementResolver.getResolver(CORE, MANAGEMENT_OPERATIONS)) .setReplyType(ModelType.STRING) .withFlag(OperationEntry.Flag.HOST_CONTROLLER_ONLY) .build(); static final OperationStepHandler INSTANCE = new FindNonProgressingOperationHandler(); @Override public void execute(OperationContext context, ModelNode operation) throws OperationFailedException { final long timeout = TimeUnit.SECONDS.toNanos(STABILITY_TIMEOUT.resolveModelAttribute(context, operation).asLong()); DomainManagementLogger.ROOT_LOGGER.debugf("Identification of operation not progressing after [%d] ns has been requested", timeout); String nonProgressing = findNonProgressingOp(context, timeout); ModelNode result = context.getResult(); if (nonProgressing != null) { result.set(nonProgressing); } } static String findNonProgressingOp(OperationContext context, long timeout) throws OperationFailedException { return findNonProgressingOp(context.readResource(PathAddress.EMPTY_ADDRESS), context.getProcessType().isServer(), timeout); } // Separate from other findNonProgressingOp variant to allow unit testing without needing a mock OperationContext static String findNonProgressingOp(Resource resource, boolean forServer, long timeout) throws OperationFailedException { Resource.ResourceEntry nonProgressing = null; for (Resource.ResourceEntry child : resource.getChildren(ACTIVE_OPERATION)) { ModelNode model = child.getModel(); if (model.get(EXCLUSIVE_RUNNING_TIME).asLong() > timeout) { nonProgressing = child; ControllerLogger.MGMT_OP_LOGGER.tracef("non-progressing op: %s", nonProgressing.getModel()); break; } } if (nonProgressing != null && !forServer) { // WFCORE-263 // See if the op is non-progressing because it's the HC op waiting for commit // from the DC while other ops (i.e. ops proxied to our servers) associated // with the same domain-uuid are not completing ModelNode model = nonProgressing.getModel(); if (model.get(DOMAIN_ROLLOUT).asBoolean() && OperationContext.ExecutionStatus.COMPLETING.toString().equals(model.get(EXECUTION_STATUS).asString()) && model.hasDefined(DOMAIN_UUID)) { ControllerLogger.MGMT_OP_LOGGER.trace("Potential domain rollout issue"); String domainUUID = model.get(DOMAIN_UUID).asString(); Set<String> relatedIds = null; List<Resource.ResourceEntry> relatedExecutingOps = null; for (Resource.ResourceEntry activeOp : resource.getChildren(ACTIVE_OPERATION)) { if (nonProgressing.getName().equals(activeOp.getName())) { continue; // ignore self } ModelNode opModel = activeOp.getModel(); if (opModel.hasDefined(DOMAIN_UUID) && domainUUID.equals(opModel.get(DOMAIN_UUID).asString()) && opModel.get(RUNNING_TIME).asLong() > timeout) { if (relatedIds == null) { relatedIds = new TreeSet<String>(); // order these as an aid to unit testing } relatedIds.add(activeOp.getName()); // If the op is ExecutionStatus.EXECUTING that means it's still EXECUTING on the // server or a prepare message got lost. It would be COMPLETING if the server // had sent a prepare message, as that would result in ProxyStepHandler calling completeStep if (OperationContext.ExecutionStatus.EXECUTING.toString().equals(opModel.get(EXECUTION_STATUS).asString())) { if (relatedExecutingOps == null) { relatedExecutingOps = new ArrayList<Resource.ResourceEntry>(); } relatedExecutingOps.add(activeOp); ControllerLogger.MGMT_OP_LOGGER.tracef("Related executing: %s", opModel); } else ControllerLogger.MGMT_OP_LOGGER.tracef("Related non-executing: %s", opModel); } else ControllerLogger.MGMT_OP_LOGGER.tracef("unrelated: %s", opModel); } if (relatedIds != null) { // There are other ops associated with this domain-uuid that are also not completing // in the desired time, so we can't treat the one holding the lock as the problem. if (relatedExecutingOps != null && relatedExecutingOps.size() == 1) { // There's a single related op that's executing for too long. So we can report that one. // Note that it's possible that the same problem exists on other hosts as well // and that this cancellation will not resolve the overall problem. But, we only // get here on a slave HC and if the user is invoking this on a slave and not the // master, we'll assume they have a reason for doing that and want us to treat this // as a problem on this particular host. nonProgressing = relatedExecutingOps.get(0); } else { // Fail and provide a useful failure message. throw DomainManagementLogger.ROOT_LOGGER.domainRolloutNotProgressing(nonProgressing.getName(), timeout, domainUUID, relatedIds); } } } } return nonProgressing == null ? null : nonProgressing.getName(); } }