/*
* RHQ Management Platform
* Copyright (C) 2005-2008 Red Hat, Inc.
* All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation version 2 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
package org.rhq.enterprise.server.cloud.instance;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.Collection;
import java.util.List;
import javax.annotation.Resource;
import javax.ejb.EJB;
import javax.ejb.Stateless;
import javax.ejb.Timeout;
import javax.ejb.Timer;
import javax.ejb.TimerConfig;
import javax.ejb.TimerService;
import javax.ejb.TransactionAttribute;
import javax.ejb.TransactionAttributeType;
import javax.persistence.EntityManager;
import javax.persistence.PersistenceContext;
import javax.persistence.Query;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jboss.as.controller.client.ModelControllerClient;
import org.rhq.common.jbossas.client.controller.CoreJBossASClient;
import org.rhq.common.jbossas.client.controller.MCCHelper;
import org.rhq.core.domain.cloud.PartitionEventType;
import org.rhq.core.domain.cloud.Server;
import org.rhq.core.domain.cloud.Server.OperationMode;
import org.rhq.core.domain.cloud.Server.Status;
import org.rhq.core.domain.resource.Agent;
import org.rhq.core.util.exception.ThrowableUtil;
import org.rhq.enterprise.communications.GlobalSuspendCommandListener;
import org.rhq.enterprise.server.RHQConstants;
import org.rhq.enterprise.server.auth.SubjectManagerLocal;
import org.rhq.enterprise.server.cloud.PartitionEventManagerLocal;
import org.rhq.enterprise.server.cloud.StatusManagerLocal;
import org.rhq.enterprise.server.cloud.TopologyManagerLocal;
import org.rhq.enterprise.server.core.comm.ServerCommunicationsServiceUtil;
import org.rhq.enterprise.server.storage.StorageClientManager;
/**
 * If you want to manipulate or report on the {@link Server} instance that
 * some piece of code is currently executing on, use the {@link ServerManagerBean}.
 *
 * This session bean determines the identity of the server it's running on by
 * reading the <code>rhq.server.high-availability.name</code> property (configured in the
 * rhq-server.properties file and read here as a system property). If that property is
 * unset or empty, the canonical host name of the local machine is used instead.
 *
 * The functionality provided here is useful when you need to execute something
 * on every server in the cloud, such as partitioned services and data.
 *
 * @author Joseph Marques
 */
@Stateless
public class ServerManagerBean implements ServerManagerLocal {

    private final Log log = LogFactory.getLog(ServerManagerBean.class);

    private static final String RHQ_SERVER_NAME_PROPERTY = "rhq.server.high-availability.name";

    /** Initial delay and repeat period of the heartbeat timer, in milliseconds. */
    private static final long HEARTBEAT_INTERVAL_MILLIS = 30000L;

    /** The operation mode applied by the last successful {@link #establishCurrentServerMode()}; null until then. */
    private static Server.OperationMode lastEstablishedServerMode = null;

    // This caches the server name, but must be set lazily as RHQ_SERVER_NAME_PROPERTY
    // may not yet be set when the static block executes (typically in i-test situations).
    private static String SERVER_NAME;

    @Resource
    private TimerService timerService;

    @PersistenceContext(unitName = RHQConstants.PERSISTENCE_UNIT_NAME)
    private EntityManager entityManager;

    @EJB
    private TopologyManagerLocal topologyManager;

    @EJB
    private StorageClientManager storageClientManager;

    @EJB
    private StatusManagerLocal agentStatusManager;

    @EJB
    private PartitionEventManagerLocal partitionEventManager;

    @EJB
    private SubjectManagerLocal subjectManager;

    // Reference to this same bean type; used by the timer callback to invoke beat()
    // (presumably so the container applies beat()'s REQUIRES_NEW transaction attribute - confirm).
    @EJB
    private ServerManagerLocal serverManager;

    /**
     * Cancels every timer currently registered with this bean's timer service and then
     * schedules a fresh, non-persistent heartbeat timer.
     */
    public void scheduleServerHeartbeat() {
        // each time the webapp is reloaded, it would create
        // duplicate events if we don't cancel the existing ones
        Collection<Timer> timers = timerService.getTimers();
        for (Timer existingTimer : timers) {
            log.debug("Found timer - attempting to cancel: " + existingTimer.toString());
            try {
                existingTimer.cancel();
            } catch (Exception e) {
                // best effort: keep going, but do not lose the reason the cancel failed
                log.warn("Failed in attempting to cancel timer: " + existingTimer.toString(), e);
            }
        }

        // recurring, non-persistent timer: first fires after 30 seconds and then every 30 seconds
        timerService.createIntervalTimer(HEARTBEAT_INTERVAL_MILLIS, HEARTBEAT_INTERVAL_MILLIS,
            new TimerConfig(null, false));
    }

    /**
     * Timer callback that performs one heartbeat. Any failure is logged and swallowed so
     * that it never propagates out of the timeout method.
     *
     * @param timer the timer that fired
     */
    @Timeout
    public void handleHeartbeatTimer(Timer timer) {
        try {
            serverManager.beat();
        } catch (Throwable t) {
            log.error("Failed to handle cloud heartbeat timer - will try again later. Cause: " + t);
            if (log.isDebugEnabled()) {
                // the error line stays compact; the full stack is available on demand
                log.debug("Cloud heartbeat failure stack trace", t);
            }
        }
    }

    /**
     * Persists the given server entity.
     *
     * @param server the server to persist
     * @return the id of the server after it was handed to the entity manager
     */
    public int create(Server server) {
        entityManager.persist(server);
        return server.getId();
    }

    /**
     * @return the name identifying the server this code is executing on
     */
    public String getIdentity() {
        return getServerName();
    }

    /**
     * Resolves, and caches, the name of the server this code is executing on.
     * <p>
     * A non-empty value of the {@value #RHQ_SERVER_NAME_PROPERTY} system property always wins and
     * replaces the cached name when it differs. Otherwise the cached name is returned, or, if nothing
     * has been cached yet, the local canonical host name is looked up once ("localhost" if that fails).
     *
     * @return the server name, never null
     */
    private static String getServerName() {
        // To support testing, or possibly a use case I can't foresee, reset the cached server name if the sysprop
        // is reset. This allows different tests to use different values and eliminates test interaction issues.
        // The property may return "" so also use "" as the default to ensure we set it to something useful
        String configuredName = System.getProperty(RHQ_SERVER_NAME_PROPERTY, "");

        // reset cached value if sysprop is set and differs
        if (!"".equals(configuredName) && !configuredName.equals(SERVER_NAME)) {
            SERVER_NAME = configuredName;
        }

        // return cached value
        if (null != SERVER_NAME) {
            return SERVER_NAME;
        }

        // Nothing cached yet, which also means the property is empty (a non-empty value would have been
        // cached above). We don't want to hit the DNS server repeatedly, for efficiency and also to protect
        // ourselves from a DNS failure, so we cache the resolved name.
        String hostName;
        try {
            hostName = InetAddress.getLocalHost().getCanonicalHostName();
        } catch (UnknownHostException e) {
            hostName = "localhost";
        }

        SERVER_NAME = hostName;
        return hostName;
    }

    /**
     * @return the agents the topology manager reports for this server's name
     */
    public List<Agent> getAgents() {
        List<Agent> results = topologyManager.getAgentsByServerName(getServerName());
        return results;
    }

    /**
     * @return the result of asking the status manager to get and clear the agents with status for this server
     */
    public List<Integer> getAndClearAgentsWithStatus() {
        List<Integer> results = agentStatusManager.getAndClearAgentsWithStatusForServer(getServerName());
        return results;
    }

    /**
     * Reports whether this server had the {@link Status#ALERT_DEFINITION} or
     * {@link Status#RESOURCE_HIERARCHY_UPDATED} status set, and clears both.
     *
     * @return true if either status was set before clearing; false if neither was set or if no
     *         server entity exists for this server's name
     */
    public boolean getAndClearServerStatus() {
        Server server = topologyManager.getServerByName(getServerName());
        if (server == null) {
            return false; // don't reload caches if we don't know who we are
        }

        boolean hadStatus = (server.hasStatus(Status.ALERT_DEFINITION) || server
            .hasStatus(Status.RESOURCE_HIERARCHY_UPDATED));
        server.clearStatus(Status.ALERT_DEFINITION);
        server.clearStatus(Status.RESOURCE_HIERARCHY_UPDATED);
        return hadStatus;
    }

    /**
     * Looks up the server entity for the server this code is executing on.
     *
     * @return the server entity, never null
     * @throws ServerNotFoundException if no server entity exists under this server's name
     */
    public Server getServer() throws ServerNotFoundException {
        Server result = topologyManager.getServerByName(getServerName());
        if (result == null) {
            throw new ServerNotFoundException("Could not find server name [" + getServerName()
                + "]. If the rhq-server.properties property [" + RHQ_SERVER_NAME_PROPERTY
                + "] is unset the server name defaults to the host name (via InetAddress.getLocalHost()). "
                + "If this value, possibly an IP address, has changed it can cause this issue.");
        }

        return result;
    }

    /**
     * Logs the given message together with the current call stack to the
     * <code>HighAvailabilityLogic</code> logger.
     * <p>
     * NOTE(review): the entry is written at FATAL level, as the original logging statement specified;
     * that may be louder than intended for a diagnostic trace - confirm.
     *
     * @param message the message to log along with the stack trace
     */
    public void printWithTrace(String message) {
        // The exception only serves as a carrier for the current stack; it is never thrown.
        // (Previously it was created inside a try block without being thrown, so the catch block
        // that did the logging could never run.)
        IllegalArgumentException stackHolder = new IllegalArgumentException(message);
        String stackTrace = ThrowableUtil.getStackAsString(stackHolder);
        LogFactory.getLog("HighAvailabilityLogic").fatal(stackTrace);
    }

    /**
     * Determines the operation mode this server should currently be in and, if it differs from the
     * last established mode, applies the change: agent references are cleared where appropriate, the
     * maintenance-mode command listener is added or removed, a partition event is recorded, and the
     * server entity's operation mode and mtime are updated.
     * <p>
     * Failures while applying a change are logged and swallowed; a failure to look up the server or
     * to determine the mode propagates to the caller.
     */
    public void establishCurrentServerMode() {
        Server server = getServer();
        Server.OperationMode serverMode = determineServerOperationMode(
            server.hasStatus(Server.Status.MANUAL_MAINTENANCE_MODE), storageClientManager.isClusterAvailable(),
            server.getOperationMode());

        // no state change means no work
        if (serverMode == lastEstablishedServerMode) {
            return;
        }

        // whenever starting up clear the agent references to this server. Agent references will exist
        // for previously connected agents that did not fail-over while this server was unavailable. This
        // is done to avoid unnecessary cache re/load and moreover provides a logically initialized environment.
        if (null == lastEstablishedServerMode) {
            printWithTrace("establishCurrentServerMode: NULL->" + serverMode + ", clearing agent references");
            clearAgentReferences(server);
        }

        try {
            if (Server.OperationMode.NORMAL == serverMode) {
                // If moving into normal operating mode from Maintenance Mode then:
                // 1) Ensure lingering agent references are cleared
                //    - this may have been done at startup already, this covers the case when we go in and
                //    - out of MM without ever taking down the server
                // 2) Re-establish server communication by taking away the MM listener
                if (Server.OperationMode.MAINTENANCE == lastEstablishedServerMode) {
                    printWithTrace("establishCurrentServerMode: MAINTENANCE->NORMAL, clearing agent references");
                    clearAgentReferences(server);
                    ServerCommunicationsServiceUtil.getService().safeGetServiceContainer()
                        .removeCommandListener(getMaintenanceModeListener());
                    log.info("Notified communication layer of server operation mode " + serverMode);
                }
            } else if (Server.OperationMode.MAINTENANCE == serverMode) {
                // If moving into Maintenance Mode from any other mode then stop processing agent commands
                ServerCommunicationsServiceUtil.getService().safeGetServiceContainer()
                    .addCommandListener(getMaintenanceModeListener());
                log.info("Notified communication layer of server operation mode " + serverMode);
            } else if (Server.OperationMode.INSTALLED == serverMode || Server.OperationMode.DOWN == serverMode) {
                // INSTALLED: the server must have just been installed and must be coming up for the
                // first time as of this call.
                // DOWN: the server can't be DOWN if this code is executing, it means the server must be
                // coming up as of this call.
                // In both cases attempt to update the mode to NORMAL. This will prevent a running
                // CloudManagerJob from resetting to DOWN before the real ServerManagerJob starts updating
                // the heart beat regularly.
                log.info("Notified communication layer of server operation mode " + serverMode);

                // remember the mode we are coming from so the audit below reports the real transition
                lastEstablishedServerMode = serverMode;
                serverMode = determineServerOperationMode(server.hasStatus(Server.Status.MANUAL_MAINTENANCE_MODE),
                    storageClientManager.isClusterAvailable(), OperationMode.NORMAL);
                if (serverMode == OperationMode.MAINTENANCE) {
                    ServerCommunicationsServiceUtil.getService().safeGetServiceContainer()
                        .addCommandListener(getMaintenanceModeListener());
                }
            }

            // If this server just transitioned from INSTALLED to NORMAL operation mode then it
            // has just been added to the cloud. Changing the number of servers in the cloud requires agent
            // distribution work, even if this is a 1-Server cloud. Generate a request for a repartitioning
            // of agent load, it will be executed on the next invocation of the cluster manager job.
            // Otherwise, audit the operation mode change as a partition event of interest.
            String audit = server.getName() + ": "
                + ((null != lastEstablishedServerMode) ? lastEstablishedServerMode : Server.OperationMode.DOWN)
                + " --> " + serverMode;

            if ((Server.OperationMode.NORMAL == serverMode)
                && (Server.OperationMode.INSTALLED == lastEstablishedServerMode)) {
                partitionEventManager.cloudPartitionEventRequest(subjectManager.getOverlord(),
                    PartitionEventType.OPERATION_MODE_CHANGE, audit);
            } else {
                partitionEventManager.auditPartitionEvent(subjectManager.getOverlord(),
                    PartitionEventType.OPERATION_MODE_CHANGE, audit);
            }

            lastEstablishedServerMode = serverMode;
            server.setOperationMode(lastEstablishedServerMode);
            server.setMtime(System.currentTimeMillis());
        } catch (Throwable e) {
            // pass the throwable along so the stack trace is not lost
            log.error("Unable to change HA Server Mode from " + lastEstablishedServerMode + " to " + serverMode + ": "
                + e, e);
        }
    }

    /**
     * Executes the named update query that removes agent references to the given server and logs
     * how many rows were affected, if any.
     *
     * @param server the server whose agent references are to be removed
     */
    private void clearAgentReferences(Server server) {
        Query query = entityManager.createNamedQuery(Agent.QUERY_REMOVE_SERVER_REFERENCE);
        query.setParameter("serverId", server.getId());
        int numRows = query.executeUpdate();
        if (numRows > 0) {
            log.info("Removed " + numRows + " obsolete agent reference(s) to server " + server.getName());
        }
    }

    // use this to ensure a listener of the same name. not using static singleton in case of class reload by different
    // classloaders (in case an exception bubbles up to the slsb layer)
    private GlobalSuspendCommandListener getMaintenanceModeListener() {
        return new GlobalSuspendCommandListener(Server.OperationMode.MAINTENANCE.name(),
            Server.OperationMode.MAINTENANCE.name());
    }

    /**
     * Sets this server entity's address to the local host name if the two differ.
     *
     * @throws SyncEndpointAddressException if the local host name cannot be determined
     */
    public void syncEndpointAddress() throws SyncEndpointAddressException {
        Server server = getServer();

        try {
            String hostName = InetAddress.getLocalHost().getHostName();
            if (!hostName.equals(server.getAddress())) {
                server.setAddress(hostName);
            }
        } catch (UnknownHostException e) {
            throw new SyncEndpointAddressException("Failed to sync endpoint address for " + server, e);
        }
    }

    /**
     * Performs one heartbeat in its own transaction: updates this server's mtime and then
     * re-evaluates the server's operation mode.
     * <p>
     * If the server entity no longer exists, the server is considered decommissioned and a shutdown
     * of the application server is requested; no operation mode handling is attempted in that case.
     */
    @TransactionAttribute(TransactionAttributeType.REQUIRES_NEW)
    public void beat() {
        try {
            Server server = getServer();
            server.setMtime(System.currentTimeMillis());
        } catch (ServerNotFoundException snfe) {
            // an admin removed our server entity, that means we are to be decommissioned so immediately shutdown
            ModelControllerClient mcc = null;
            try {
                log.info("This server has been decommissioned!!! It will now shutdown.");
                mcc = MCCHelper.createModelControllerClient();
                new CoreJBossASClient(mcc).shutdown(false);
            } catch (Exception e) {
                log.fatal(
                    "This server was decommissioned, however, it failed to shut itself down. This server will now behave in an indeterminate manner. Please shut it down.",
                    e);
            } finally {
                MCCHelper.safeClose(mcc);
            }

            // There is no server entity to establish a mode for; looking it up again would only fail
            // with the same ServerNotFoundException.
            return;
        }

        // Handles server mode state changes
        // note: this call should be fast. if not we need to break the heart beat into its own job
        establishCurrentServerMode();
    }

    /**
     * Computes the operation mode the server should be in.
     * <p>
     * DOWN and INSTALLED are returned unchanged. For NORMAL or MAINTENANCE the result is NORMAL only
     * when the server is not in manual maintenance and the storage cluster is available; otherwise it
     * is MAINTENANCE.
     *
     * @param isManualMaintenance whether the server is flagged for manual maintenance mode
     * @param isStorageClusterAvailable whether the storage cluster is available
     * @param requestedOperationMode the mode to start the decision from
     * @return the resulting operation mode
     * @throws IllegalStateException if requestedOperationMode is null or any mode other than the four above
     */
    private Server.OperationMode determineServerOperationMode(boolean isManualMaintenance,
        boolean isStorageClusterAvailable, Server.OperationMode requestedOperationMode) {

        if (Server.OperationMode.DOWN == requestedOperationMode
            || Server.OperationMode.INSTALLED == requestedOperationMode) {
            return requestedOperationMode;
        }

        if (Server.OperationMode.NORMAL == requestedOperationMode
            || Server.OperationMode.MAINTENANCE == requestedOperationMode) {
            if (!isManualMaintenance && isStorageClusterAvailable) {
                return OperationMode.NORMAL;
            } else {
                return OperationMode.MAINTENANCE;
            }
        }

        throw new IllegalStateException("Unable to determine new server operation mode. Requested mode: ["
            + requestedOperationMode + "]");
    }
}