/*
 * RHQ Management Platform
 * Copyright (C) 2005-2008 Red Hat, Inc.
 * All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation version 2 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
package org.rhq.enterprise.server.cloud;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import javax.ejb.EJB;
import javax.ejb.Stateless;
import javax.persistence.EntityManager;
import javax.persistence.NoResultException;
import javax.persistence.PersistenceContext;
import javax.persistence.Query;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.rhq.core.domain.cloud.AffinityGroup;
import org.rhq.core.domain.cloud.FailoverList;
import org.rhq.core.domain.cloud.FailoverListDetails;
import org.rhq.core.domain.cloud.PartitionEvent;
import org.rhq.core.domain.cloud.PartitionEventDetails;
import org.rhq.core.domain.cloud.Server;
import org.rhq.core.domain.cloud.composite.FailoverListComposite;
import org.rhq.core.domain.cloud.composite.FailoverListComposite.ServerEntry;
import org.rhq.core.domain.cloud.composite.FailoverListDetailsComposite;
import org.rhq.core.domain.resource.Agent;
import org.rhq.enterprise.server.RHQConstants;
import org.rhq.enterprise.server.core.AgentManagerLocal;

/**
 * This session bean acts as the single interface with which the distribution algorithm
 * will interact. The distribution algorithm runs as a result of various changes in the
 * system including but not limited to: newly registering agents, currently connecting
 * agents, cloud membership changes (server added/removed), and redistributions according
 * to agent load. The result of the distribution algorithm is a single (or a set of)
 * {@link FailoverList} objects that are sent down to the connected agents. The agents
 * then use these lists to determine which server to fail over to, if their primary server
 * is unreachable and/or goes down.
 *
 * @author Joseph Marques
 * @author Jay Shaughnessy
 */
@Stateless
public class FailoverListManagerBean implements FailoverListManagerLocal {

    private final Log log = LogFactory.getLog(FailoverListManagerBean.class);

    /** The variation in load between most loaded and least loaded server that indicates balanced load. */
    private static final double ACCEPTABLE_DISPARITY = 0.10;

    @PersistenceContext(unitName = RHQConstants.PERSISTENCE_UNIT_NAME)
    private EntityManager entityManager;

    @EJB
    //@IgnoreDependency
    TopologyManagerLocal topologyManager;

    @EJB
    AgentManagerLocal agentManager;

    @EJB
    FailoverListManagerLocal failoverListManager;

    /**
     * Returns the already-persisted failover list for the named agent, or null if none exists.
     *
     * @param agentName the agent's registration name
     * @return the existing {@link FailoverListComposite}, or null if no list has been generated
     * @throws IllegalArgumentException if no agent is registered under the given name
     */
    public FailoverListComposite getExistingForSingleAgent(String agentName) {
        Agent agent = agentManager.getAgentByName(agentName);

        if (null == agent) {
            throw new IllegalArgumentException("No agent found for registration name: " + agentName);
        }

        return doGetExistingForSingleAgent(agent);
    }

    /**
     * Looks up the persisted {@link FailoverList} for the agent and converts it into a composite.
     * Returns null when no list exists (swallows {@link NoResultException} deliberately).
     */
    private FailoverListComposite doGetExistingForSingleAgent(Agent agent) {
        FailoverListComposite result = null;

        Query query = entityManager.createNamedQuery(FailoverList.QUERY_GET_VIA_AGENT);
        query.setParameter("agent", agent);

        try {
            FailoverList serverList = (FailoverList) query.getSingleResult();
            List<ServerEntry> serverEntries = new ArrayList<ServerEntry>();
            for (FailoverListDetails next : serverList.getServerList()) {
                serverEntries.add(next.getServer().getServerEntry());
            }
            result = new FailoverListComposite(serverEntries);
        } catch (NoResultException e) {
            // no list has been generated for this agent yet; the caller handles null
            result = null;
        }

        return result;
    }

    /**
     * Returns the failover list for the named agent, generating (and persisting) a new one
     * if none currently exists.
     *
     * @param event the partition event under which a newly generated list is recorded
     * @param agentName the agent's registration name
     * @throws IllegalArgumentException if no agent is registered under the given name
     */
    public FailoverListComposite getForSingleAgent(PartitionEvent event, String agentName) {
        // If a server list already exists then just return it
        Agent agent = agentManager.getAgentByName(agentName);

        if (null == agent) {
            throw new IllegalArgumentException("No agent found for registration name: " + agentName);
        }

        FailoverListComposite result = doGetExistingForSingleAgent(agent);

        if (null == result) {
            result = generateServerList(event, agent);
        }

        return result;
    }

    /**
     * Generates and persists a failover list for a single agent, taking the currently assigned
     * loads of all cloud servers into account.
     */
    private FailoverListComposite generateServerList(PartitionEvent event, Agent agent) {
        List<Server> servers = topologyManager.getAllCloudServers();
        List<Agent> agents = new ArrayList<Agent>(1);
        agents.add(agent);

        // get the current agent assignments for the servers
        // TODO (jshaughn) Note that "load" in the query name is not true load but rather the count of agents assigned
        // to each server (by server list ordinal). This is fine until we decide to introduce relative load values for
        // the agents. Even at that this algorithm may be ok for adding new agents and defer to a full repartition to
        // apply agent-specific load factors.
        Query query = entityManager.createNamedQuery(FailoverListDetails.QUERY_GET_ASSIGNED_LOADS);
        @SuppressWarnings("unchecked")
        List<FailoverListDetailsComposite> existingLoads = query.getResultList();

        Map<Agent, FailoverListComposite> agentServerListMap = getForAgents(event, servers, agents, existingLoads);

        persistComposites(event, agentServerListMap);

        return (agentServerListMap.get(agent));
    }

    /**
     * Regenerates failover lists for every known agent against every cloud server, replacing all
     * previously persisted lists.
     *
     * @param event the partition event under which the new lists are recorded
     * @return map of each agent to its newly generated failover list
     */
    public Map<Agent, FailoverListComposite> refresh(PartitionEvent event) {
        List<Server> servers = topologyManager.getAllCloudServers();
        List<Agent> agents = agentManager.getAllAgents();

        // persist results immediate, which will be the only writes (as opposed to reads) in this transaction
        Map<Agent, FailoverListComposite> agentServerListMap = getForAgents(event, servers, agents, null);

        /* now that the intense in-memory manipulation is complete, let's do the stuff that needs to persist the
         * results to the database; clear out the existing lists **just** before persisting the new ones to keep
         * row lock hold time low */
        clear(); // clear out the existing server lists because we're going to generate new ones for all agents
        persistComposites(event, agentServerListMap);

        return agentServerListMap;
    }

    /**
     * Regenerates failover lists for the given agents against the given servers, deleting each
     * agent's old list just before persisting the new ones (to keep row-lock hold time low).
     *
     * @param event the partition event under which the new lists are recorded
     * @param servers the servers to distribute across
     * @param agents the agents to generate lists for
     * @return map of each agent to its newly generated failover list
     */
    public Map<Agent, FailoverListComposite> refresh(PartitionEvent event, List<Server> servers, List<Agent> agents) {
        // do not persist results immediately, instead return the results and then delete/persist in quick succession
        Map<Agent, FailoverListComposite> agentServerListMap = getForAgents(event, servers, agents, null);

        /* now that the intense in-memory manipulation is complete, let's do the stuff that needs to persist the
         * results to the database; clear out the existing lists **just** before persisting the new ones to keep
         * row lock hold time low */
        for (Agent next : agents) {
            // clear out the existing server lists because we're going to generate new ones for all agents
            deleteServerListsForAgent(next);
        }
        persistComposites(event, agentServerListMap);

        return agentServerListMap;
    }

    /*
     * NOTE: this method used to persist the agentServerListMap results at the end of processing; however,
     * certain callers that performed write operations before calling this method would hold row locks
     * too long; so, this method no longer does the persistence, which puts the onus on callers to do so;
     * some callers will immediately persist the results, otherwise may want to perform other updates or
     * deletions just prior to persistence - the caller now has that option
     */
    private Map<Agent, FailoverListComposite> getForAgents(PartitionEvent event, List<Server> servers,
        List<Agent> agents, List<FailoverListDetailsComposite> existingLoads) {

        Map<Agent, FailoverListComposite> result = new HashMap<Agent, FailoverListComposite>(agents.size());

        // create a bucket for each server to which we will assign agents
        List<ServerBucket> buckets = new ArrayList<ServerBucket>(servers.size());
        for (Server next : servers) {
            buckets.add(new ServerBucket(next));
        }

        // initialize the result map
        Map<Agent, List<ServerBucket>> agentServerListMap = new HashMap<Agent, List<ServerBucket>>(agents.size());
        for (Agent next : agents) {
            agentServerListMap.put(next, new ArrayList<ServerBucket>(servers.size()));
        }

        // assign server lists level by level: primary, then secondary, then tertiary, etc
        for (int level = 0; (level < servers.size()); ++level) {
            // Initialize the bucket loads for the next round
            initBuckets(buckets, existingLoads, level);

            // assign a server for this level to each agent, balancing as we go
            // keep track of the how many agents have been assigned on this pass
            int agentsAssigned = 0;

            // introduce more list disparity by changing the bucket iteration direction on each level
            int rotate = (((level % 2) == 0) ? -1 : 1);

            for (Agent next : agents) {
                List<ServerBucket> serverList = agentServerListMap.get(next);

                // When assigning primary (i.e. level 0), supply the current primary as the preferred server.
                // This should reduce connection churn by letting most agents stay put (but affects balancing, we'll
                // deal with that below)
                ServerBucket bestBucket = null;
                if ((0 == level) && (null != next.getServer())) {
                    bestBucket = ServerBucket.getBestBucket(buckets, serverList, next.getAffinityGroup(), next
                        .getServer().getName());
                } else {
                    bestBucket = ServerBucket.getBestBucket(buckets, serverList, next.getAffinityGroup(), null);
                }

                // Rotate the list on each iteration. This enhances bucket distribution amongst the levels and ensures
                // that we don't starve buckets at the end of the list. Also, we alternate the rotation direction on
                // each level which seems to help.
                Collections.rotate(buckets, rotate);

                // Reverse the buckets completely each time we have assigned an agent to each server. This avoids
                // duplicating failover lists completely by not repeating the same server sequence over and over on the
                // same level.
                if ((++agentsAssigned % buckets.size() == 0)) {
                    Collections.reverse(buckets);
                }

                if (null == bestBucket) {
                    // this should never happen but let's defensively check and log
                    log.error("Unexpected Condition! null bucket in getForAllAgents()");
                    continue;
                }

                serverList.add(bestBucket);
                // note that assigned load takes into consideration compute power of the server
                bestBucket.assignedLoad += (getAgentLoad(next) / bestBucket.computePower);
                bestBucket.assignedAgents.add(next);
            }

            // For debugging
            logServerList("Level " + level, agentServerListMap);

            // The first pass does a best-effort balancing as it goes but may need further balancing because:
            // - the assignment of primary servers tries to retain the current primary server of an existing agent.
            //   This disrupts the load balancing (but reduces churn).
            // - the algorithm is greedy, assigning servers as they are available, this can overload a server near the
            //   end of assignments (due to, for example, constraints avoiding server duplication in a server list).
            // Now, if necessary for load balance, force some agents to new servers.
            if (balanceLoad(buckets, agentServerListMap)) {
                // for debugging
                logServerList("Forced Rebalance!", agentServerListMap);
            }
        }

        // generate the result Map
        for (Agent next : agentServerListMap.keySet()) {
            List<ServerEntry> serverEntries = new ArrayList<ServerEntry>(servers.size());
            for (ServerBucket bucket : agentServerListMap.get(next)) {
                serverEntries.add(bucket.serverEntry);
            }
            result.put(next, new FailoverListComposite(serverEntries));
        }

        return result;
    }

    /**
     * Resets each bucket's assigned load/agents for a new level, seeding the load from any
     * previously persisted assignments at that level.
     */
    private void initBuckets(List<ServerBucket> buckets, List<FailoverListDetailsComposite> existingLoads, int level) {
        for (ServerBucket bucket : buckets) {
            bucket.assignedLoad = 0.0;
            bucket.assignedAgents.clear();

            if (null != existingLoads) {
                int serverId = bucket.server.getId();
                for (FailoverListDetailsComposite existingLoad : existingLoads) {
                    if ((existingLoad.ordinal == level) && (existingLoad.serverId == serverId)) {
                        // NOTE(review): assumes assignedAgentCount/computePower promotes to floating point
                        // division (computePower appears to be a double elsewhere) — confirm in ServerBucket
                        bucket.assignedLoad = (existingLoad.assignedAgentCount / bucket.computePower);
                        break;
                    }
                }
            }
        }
    }

    /**
     * Force agents to new servers, if possible, to give us better load balance. A perfect balance is not necessarily
     * produced due to:<pre>
     * Currently, this algorithm will not break affinity.
     * An ACCEPTABLE_DISPARITY between the high server load and low server load is achieved
     * no legal swaps are possible.
     * </pre>
     *
     * @return true if any agent was moved to a different server
     */
    private boolean balanceLoad(List<ServerBucket> buckets, Map<Agent, List<ServerBucket>> agentServerListMap) {
        boolean done = false;
        boolean rebalanced = false;

        // need at least two buckets to balance
        if (buckets.size() < 2) {
            return false;
        }

        do {
            // sort buckets from high load to low load
            // Double.compare returns 0 for equal loads; a comparator that never returns 0 violates the
            // Comparator contract and can make Collections.sort throw on larger lists
            Collections.sort(buckets, new Comparator<ServerBucket>() {
                public int compare(ServerBucket bucket1, ServerBucket bucket2) {
                    return Double.compare(bucket2.assignedLoad, bucket1.assignedLoad);
                }
            });

            ServerBucket lowBucket = buckets.get(buckets.size() - 1);

            // if the load disparity is acceptable then we're done.
            if (getLoadDisparity(buckets.get(0).assignedLoad, lowBucket.assignedLoad) < ACCEPTABLE_DISPARITY) {
                done = true;
                continue;
            }

            // find an agent to move by traversing the buckets from high to low (excluding lowest bucket)
            for (ServerBucket bucket : buckets) {
                // if we've looked in all of the buckets and found nothing to move then we're done
                if (bucket == lowBucket) {
                    done = true;
                    break;
                }

                AffinityGroup affinityGroup = bucket.server.getAffinityGroup();
                boolean checkAffinity = ((null != affinityGroup) && !affinityGroup.equals(lowBucket.server
                    .getAffinityGroup()));
                int highIndex = -1;
                double highLoad = 0.0;
                double load = 0.0;

                for (int i = 0, size = bucket.assignedAgents.size(); (i < size); ++i) {
                    Agent agent = bucket.assignedAgents.get(i);

                    // we don't move an agent with satisfied affinity to a bucket that breaks affinity
                    if (checkAffinity && affinityGroup.equals(agent.getAffinityGroup())) {
                        continue;
                    }

                    // we don't move an agent that is already assigned to lowBucket
                    if (agentServerListMap.get(agent).contains(lowBucket)) {
                        continue;
                    }

                    load = getAgentLoad(agent);
                    if (load > highLoad) {
                        // protect against a move that would send too much load to the lowBucket, effectively just
                        // reversing the problem and allowing this algorithm to thrash. Don't allow a move that
                        // increases the lowBucket load higher than the current bucket.
                        if (!((lowBucket.assignedLoad + load) > (bucket.assignedLoad - load))) {
                            highIndex = i;
                            highLoad = load;
                        }
                    }
                }

                // If we found an agent to move then make the move, otherwise look in the next bucket
                if (highIndex > -1) {
                    Agent agent = bucket.assignedAgents.remove(highIndex);
                    lowBucket.assignedAgents.add(agent);
                    agentServerListMap.get(agent).remove(bucket);
                    agentServerListMap.get(agent).add(lowBucket);
                    lowBucket.assignedLoad += highLoad;
                    bucket.assignedLoad -= highLoad;
                    rebalanced = true;
                    break;
                }
            }
        } while (!done);

        return rebalanced;
    }

    /**
     * Returns the relative disparity between the highest and lowest server load, as a fraction
     * of the highest load. A zero high load means nothing is assigned anywhere, i.e. no disparity.
     */
    private double getLoadDisparity(double highLoad, double lowLoad) {
        if (highLoad == 0.0) {
            // guard against 0/0 producing NaN, which would never compare as "acceptable"
            return 0.0;
        }
        return ((highLoad - lowLoad) / highLoad);
    }

    // TODO (jshaughn) figure out how to measure agent load. It should be relative to all other agents, probably
    // normalized such that the average agent is load 1.0. All agents must have positive load. If the load needs to be
    // computed here perhaps it should be stored on the AgentServerList to avoid recalculation, if it is expensive.
    private double getAgentLoad(Agent agent) {
        if (null == agent) {
            return 0.0;
        }
        return 1.0;
    }

    /**
     * Logs the current agent-to-server-list assignments for debugging. Cheap no-op unless
     * INFO logging is enabled.
     */
    private void logServerList(String debugTitle, Map<Agent, List<ServerBucket>> agentServerListMap) {
        if (!log.isInfoEnabled()) {
            return;
        }

        StringBuilder sb = new StringBuilder("\nServerList (");
        sb.append(debugTitle);
        sb.append(") :");
        for (Agent agent : agentServerListMap.keySet()) {
            sb.append("\n\n Agent: " + agent.getName());
            for (ServerBucket bucket : agentServerListMap.get(agent)) {
                sb.append("\n   ");
                sb.append(bucket.assignedLoad);
                sb.append(" : ");
                sb.append(bucket.server.getName());
            }
        }
        sb.append("\n\n");
        log.info(sb.toString());
    }

    /**
     * Deletes the persisted failover list (details first, then the list itself) for one agent.
     */
    public void deleteServerListsForAgent(Agent agent) {
        Query query1 = entityManager.createNamedQuery(FailoverListDetails.QUERY_DELETE_VIA_AGENT);
        Query query2 = entityManager.createNamedQuery(FailoverList.QUERY_DELETE_VIA_AGENT);

        query1.setParameter("agent", agent);
        query2.setParameter("agent", agent);

        query1.executeUpdate();
        query2.executeUpdate();
    }

    /**
     * Deletes all persisted failover-list detail rows that reference the given server.
     */
    public void deleteServerListDetailsForServer(int serverId) {
        Query query = entityManager.createNamedQuery(FailoverListDetails.QUERY_DELETE_VIA_SERVER);
        query.setParameter("serverId", serverId);
        query.executeUpdate();
    }

    /**
     * Truncates all persisted failover lists and their details (details first to respect FK order).
     */
    private void clear() {
        Query query = entityManager.createNamedQuery(FailoverListDetails.QUERY_TRUNCATE);
        query.executeUpdate();
        query = entityManager.createNamedQuery(FailoverList.QUERY_TRUNCATE);
        query.executeUpdate();
    }

    /**
     * Persists the generated failover lists. Also records a {@link PartitionEventDetails} row for
     * each agent's primary (ordinal 0) server so the event captures the primary-server topology.
     */
    private void persistComposites(PartitionEvent event, Map<Agent, FailoverListComposite> agentServerListMap) {
        FailoverList fl = null;
        FailoverListDetails failoverListDetails = null;
        PartitionEventDetails eventDetails = null;

        for (Map.Entry<Agent, FailoverListComposite> next : agentServerListMap.entrySet()) {
            Agent nextAgent = next.getKey();
            FailoverListComposite nextComposite = next.getValue();
            fl = new FailoverList(event, nextAgent);
            entityManager.persist(fl);
            boolean first = true;
            for (int i = 0; i < nextComposite.size(); ++i) {
                ServerEntry serverEntry = nextComposite.get(i);
                Server server = entityManager.find(Server.class, serverEntry.serverId);
                failoverListDetails = new FailoverListDetails(fl, i, server);
                entityManager.persist(failoverListDetails);

                // event details only shows the current primary server topology
                if (first) {
                    eventDetails = new PartitionEventDetails(event, nextAgent, server);
                    entityManager.persist(eventDetails);
                    first = false;
                }
            }
        }
    }
}