/*******************************************************************************
*
* Copyright (c) 2012 GigaSpaces Technologies Ltd. All rights reserved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
package org.openspaces.grid.gsm.machines;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeoutException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.openspaces.admin.Admin;
import org.openspaces.admin.gsa.GSAReservationId;
import org.openspaces.admin.gsa.GridServiceAgent;
import org.openspaces.admin.gsc.GridServiceContainer;
import org.openspaces.admin.internal.gsa.InternalGridServiceAgents;
import org.openspaces.admin.internal.zone.config.ZonesConfigUtils;
import org.openspaces.admin.pu.ProcessingUnit;
import org.openspaces.admin.zone.config.ExactZonesConfig;
import org.openspaces.admin.zone.config.ZonesConfig;
import org.openspaces.grid.gsm.SingleThreadedPollingLog;
import org.openspaces.grid.gsm.capacity.CapacityRequirements;
import org.openspaces.grid.gsm.capacity.CapacityRequirementsPerAgent;
import org.openspaces.grid.gsm.containers.ContainersSlaUtils;
import org.openspaces.grid.gsm.machines.backup.MachinesState;
import org.openspaces.grid.gsm.machines.exceptions.UndeployInProgressException;
import org.openspaces.grid.gsm.machines.isolation.ElasticProcessingUnitMachineIsolation;
import org.openspaces.grid.gsm.machines.isolation.PublicMachineIsolation;
import com.gigaspaces.document.DocumentProperties;
import com.gigaspaces.internal.version.PlatformLogicalVersion;
public class MachinesSlaEnforcementState {
/**
 * Key of the enforcement state map: a processing unit paired with the
 * grid service agent zones its allocations are tracked under.
 * Either component may be null; equality and hashing tolerate that.
 * Natural ordering compares the {@link #toString()} representations.
 */
public static class StateKey implements Comparable<StateKey>{

    ProcessingUnit pu;
    ZonesConfig gridServiceAgentZones;

    public StateKey (ProcessingUnit pu, ZonesConfig gridServiceAgentZones) {
        this.pu = pu;
        this.gridServiceAgentZones = gridServiceAgentZones;
    }

    /**
     * Combines the zones hash and the pu hash (in that order) with the
     * multiplier 31; a null component contributes 0.
     */
    @Override
    public int hashCode() {
        final int zonesHash = (gridServiceAgentZones == null) ? 0 : gridServiceAgentZones.hashCode();
        final int puHash = (pu == null) ? 0 : pu.hashCode();
        return 31 * (31 + zonesHash) + puHash;
    }

    /**
     * Two keys are equal when they are of the same class and both their zones
     * and their processing unit are equal (or both null).
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj == null || getClass() != obj.getClass()) {
            return false;
        }
        final StateKey that = (StateKey) obj;
        return sameOrBothNull(gridServiceAgentZones, that.gridServiceAgentZones)
                && sameOrBothNull(pu, that.pu);
    }

    /** Null-tolerant equality: null only equals null, otherwise delegates to left.equals(right). */
    private static boolean sameOrBothNull(Object left, Object right) {
        if (left == null) {
            return right == null;
        }
        return left.equals(right);
    }

    /** Renders the pu name and the agent zones, omitting whichever is null. */
    @Override
    public String toString() {
        final StringBuilder text = new StringBuilder("StateKey [");
        if (pu != null) {
            text.append("pu=").append(pu.getName()).append(", ");
        }
        if (gridServiceAgentZones != null) {
            text.append("agentZones=").append(gridServiceAgentZones);
        }
        return text.append("]").toString();
    }

    /** Orders keys lexicographically by their string representation. */
    @Override
    public int compareTo(StateKey o) {
        final String mine = this.toString();
        final String theirs = o.toString();
        return mine.compareTo(theirs);
    }
}
/**
 * Mutable enforcement state held for a single {@link StateKey}: capacity that is
 * allocated, capacity that is marked for de-allocation, agents that are still being
 * started, machines that are being stopped and agents that were detected as failed.
 * Mutating methods (other than {@link #completedStateRecoveryAfterRestart()}) increment
 * the enclosing object's machinesStateVersion counter.
 */
class StateValue {

    private CapacityRequirementsPerAgent allocatedCapacity = new CapacityRequirementsPerAgent();
    private final List<GridServiceAgentFutures> futureAgents = new ArrayList<GridServiceAgentFutures>();
    private CapacityRequirementsPerAgent markedForDeallocationCapacity = new CapacityRequirementsPerAgent();
    private ElasticProcessingUnitMachineIsolation machineIsolation;
    private List<FutureStoppedMachine> machinesBeingStopped = new ArrayList<FutureStoppedMachine>();
    private boolean completedStateRecoveryAfterRestart;
    private List<RecoveringFailedGridServiceAgent> failedAgents = new ArrayList<RecoveringFailedGridServiceAgent>();

    /** Registers a machine that is in the process of being stopped. */
    public void addFutureStoppedMachine(FutureStoppedMachine futureStoppedMachine) {
        machinesBeingStopped.add(futureStoppedMachine);
        machinesStateVersion++;
    }

    /** Forgets a machine that was registered as being stopped. */
    public void removeFutureStoppedMachine(FutureStoppedMachine futureStoppedMachine) {
        machinesBeingStopped.remove(futureStoppedMachine);
        machinesStateVersion++;
    }

    /** @return an unmodifiable snapshot (copy) of the machines currently being stopped. */
    public Collection<FutureStoppedMachine> getMachineGoingDown() {
        return Collections.unmodifiableList(new ArrayList<FutureStoppedMachine>(this.machinesBeingStopped));
    }

    /** Records a group of agents that are being started together with the capacity they were requested for. */
    public void addFutureAgents(FutureGridServiceAgent[] newFutureAgents, CapacityRequirements capacityRequirements) {
        futureAgents.add(new GridServiceAgentFutures(newFutureAgents,capacityRequirements));
        machinesStateVersion++;
    }

    /**
     * Adds capacity on the specified agent to the allocated capacity.
     * @throws IllegalStateException if machine isolation has not been set yet
     */
    public void allocateCapacity(String agentUid, CapacityRequirements capacity) {
        if (machineIsolation == null) {
            throw new IllegalStateException(this + " should have set machine isolation before allocating capacity");
        }
        // guard: the message (incl. toDetailedString) is only worth building when trace is on
        if (logger.isTraceEnabled()) {
            logger.trace("Adding {" + agentUid + ", " + capacity + "} to allocatedCapacity = " + allocatedCapacity.toDetailedString());
        }
        allocatedCapacity = allocatedCapacity.add(agentUid,capacity);
        machinesStateVersion++;
    }

    /**
     * Moves capacity on the specified agent from the allocated capacity to the
     * marked-for-deallocation capacity.
     * @throws IllegalStateException if machine isolation has not been set yet
     */
    public void markCapacityForDeallocation(String agentUid, CapacityRequirements capacity) {
        if (machineIsolation == null) {
            throw new IllegalStateException(this + " should have set machine isolation before marking capacity for de-allocation");
        }
        if (logger.isTraceEnabled()) {
            logger.trace("Subtracting {" + agentUid + ", " + capacity + "} from allocatedCapacity = " + allocatedCapacity);
        }
        allocatedCapacity = allocatedCapacity.subtract(agentUid,capacity);
        if (logger.isTraceEnabled()) {
            logger.trace("Adding {" + agentUid + ", " + capacity + "} to markedForDeallocationCapacity = " + markedForDeallocationCapacity);
        }
        markedForDeallocationCapacity = markedForDeallocationCapacity.add(agentUid, capacity);
        machinesStateVersion++;
    }

    /**
     * Moves capacity on the specified agent back from the marked-for-deallocation
     * capacity to the allocated capacity.
     * @throws IllegalStateException if machine isolation has not been set yet
     */
    public void unmarkCapacityForDeallocation(String agentUid, CapacityRequirements capacity) {
        if (machineIsolation == null) {
            throw new IllegalStateException(this + " should have set machine isolation before un-marking capacity for de-allocation");
        }
        if (logger.isTraceEnabled()) {
            logger.trace("Subtracting {" + agentUid + ", " + capacity + "} from markedForDeallocationCapacity = " + markedForDeallocationCapacity);
        }
        markedForDeallocationCapacity = markedForDeallocationCapacity.subtract(agentUid, capacity);
        allocateCapacity(agentUid, capacity);
        machinesStateVersion++;
    }

    /**
     * Removes capacity on the specified agent from the marked-for-deallocation capacity.
     * @throws IllegalStateException if machine isolation has not been set yet
     */
    public void deallocateCapacity(String agentUid, CapacityRequirements capacity) {
        if (machineIsolation == null) {
            throw new IllegalStateException(this + " should have set machine isolation before de-allocating capacity");
        }
        if (logger.isTraceEnabled()) {
            logger.trace("Subtracting {" + agentUid + ", " + capacity + "} from markedForDeallocationCapacity = " + markedForDeallocationCapacity);
        }
        markedForDeallocationCapacity = markedForDeallocationCapacity.subtract(agentUid, capacity);
        machinesStateVersion++;
    }

    /**
     * Transfers whatever the old agent holds (both allocated and marked for
     * de-allocation) to the new agent uid. Nothing is transferred for a part
     * that is zero.
     */
    public void replaceAllocation(String oldAgentUid, String newAgentUid) {
        final CapacityRequirements agentDeallocationCapacity = markedForDeallocationCapacity.getAgentCapacityOrZero(oldAgentUid);
        if (!agentDeallocationCapacity.equalsZero()) {
            markedForDeallocationCapacity = markedForDeallocationCapacity.subtractAgent(oldAgentUid).add(newAgentUid, agentDeallocationCapacity);
        }
        final CapacityRequirements agentAllocatedCapacity = allocatedCapacity.getAgentCapacityOrZero(oldAgentUid);
        if (!agentAllocatedCapacity.equalsZero()) {
            allocatedCapacity = allocatedCapacity.subtractAgent(oldAgentUid);
            allocateCapacity(newAgentUid, agentAllocatedCapacity);
        }
        machinesStateVersion++;
    }

    /** @return a new list with the future agent groups that report isDone(). */
    public Collection<GridServiceAgentFutures> getAllDoneFutureAgents() {
        final List<GridServiceAgentFutures> doneFutures = new ArrayList<GridServiceAgentFutures>();
        for (GridServiceAgentFutures future : futureAgents) {
            if (future.isDone()) {
                doneFutures.add(future);
            }
        }
        return doneFutures;
    }

    /**
     * Removes a group of future agents.
     * @throws IllegalStateException if machine isolation has not been set yet
     */
    public void removeFutureAgents(GridServiceAgentFutures futureAgentsToRemove) {
        if (machineIsolation == null) {
            throw new IllegalStateException(this + " should have set machine isolation before removing future agent");
        }
        futureAgents.remove(futureAgentsToRemove);
        machinesStateVersion++;
    }

    /** Flags that state recovery after restart has completed for this value. */
    public void completedStateRecoveryAfterRestart() {
        completedStateRecoveryAfterRestart = true;
    }

    @Override
    public String toString() {
        return "StateValue ["
                + (allocatedCapacity != null ? "allocatedCapacity=" + allocatedCapacity + ", " : "")
                + (futureAgents != null ? "futureAgents=" + futureAgents + ", " : "")
                + (markedForDeallocationCapacity != null ? "markedForDeallocationCapacity="
                        + markedForDeallocationCapacity + ", " : "")
                + (machineIsolation != null ? "machineIsolation=" + machineIsolation + ", " : "")
                + "completedStateRecoveryAfterRestart="
                + completedStateRecoveryAfterRestart + ", "
                + "failedAgents=" + failedAgents + "]";
    }

    /** @return true when nothing is allocated, nothing is marked for de-allocation and no agents are being started. */
    public boolean equalsZero() {
        return allocatedCapacity.equalsZero()
                && markedForDeallocationCapacity.equalsZero()
                && futureAgents.isEmpty();
    }

    /** @return an unmodifiable view of the agents recorded as failed. */
    public Collection<RecoveringFailedGridServiceAgent> getFailedAgents() {
        return Collections.unmodifiableCollection(failedAgents);
    }

    /** Records an agent as failed. */
    public void addFailedAgent(RecoveringFailedGridServiceAgent failedAgent) {
        failedAgents.add(failedAgent);
        machinesStateVersion++;
    }

    /** Removes every failed-agent record whose uid equals the specified uid. */
    public void removeFailedAgent(String agentUid) {
        final Iterator<RecoveringFailedGridServiceAgent> it = failedAgents.iterator();
        while (it.hasNext()) {
            if (it.next().getAgentUid().equals(agentUid)) {
                it.remove();
            }
        }
        machinesStateVersion++;
    }
}
// Logger of this state object; the constructor wraps the class log in a SingleThreadedPollingLog.
private final Log logger;
// Enforcement state per (processing unit, agent zones) key; entries are created lazily by getState().
private final Map<StateKey,StateValue> state;
/** Outcome of recovering a processing unit's state; NOT_RECOVERED is reported when nothing was recorded. */
public enum RecoveryState {
NOT_RECOVERED, RECOVERY_SUCCESS, RECOVERY_FAILED
}
// Recorded recovery outcome per processing unit (see recoveredStateOnEsmStart / failedRecoveredStateOnEsmStart).
private final Map<ProcessingUnit, RecoveryState> recoveredStatePerProcessingUnit;
// Processing units consulted by validateUndeployNotInProgress; cleared per pu in beforeUndeployProcessingUnit.
private final Set<ProcessingUnit> validatedUndeployNotInProgressPerProcessingUnit;
// Per processing unit cloud cleanup future; removed in afterUndeployProcessingUnit.
private final Map<ProcessingUnit, FutureCleanupCloudResources> cloudCleanupPerProcessingUnit;
// NOTE(review): presumably maps an ip address to the agent whose failover is disabled - usage is outside this view, confirm.
private final Map<String, String> agentWithFailoverDisabledPerIpAddress;
// Opaque context object per agent uid (see addAgentContext / getAgentContext / removeAgentContext).
private final Map<String, Object> agentsContext;
// Counter incremented by the state mutating methods.
private long machinesStateVersion;
public MachinesSlaEnforcementState() {
this.logger =
new SingleThreadedPollingLog(
LogFactory.getLog(MachinesSlaEnforcementState.class));
state = new HashMap<StateKey,StateValue>();
recoveredStatePerProcessingUnit = new HashMap<ProcessingUnit, MachinesSlaEnforcementState.RecoveryState>();
validatedUndeployNotInProgressPerProcessingUnit = new HashSet<ProcessingUnit>();
cloudCleanupPerProcessingUnit = new HashMap<ProcessingUnit, FutureCleanupCloudResources>();
agentWithFailoverDisabledPerIpAddress = new HashMap<String, String>();
agentsContext = new LinkedHashMap<String,Object>();
machinesStateVersion = 0;
}
/**
 * @return true if at least one agent-zones entry is reported by
 *         {@link #getGridServiceAgentsZones(ProcessingUnit)} for the pu.
 */
public boolean isHoldingStateForProcessingUnit(ProcessingUnit pu) {
    final Set<ZonesConfig> zonesWithState = getGridServiceAgentsZones(pu);
    return !zonesWithState.isEmpty();
}
/**
 * Looks up the value stored for the key; when the key has no entry yet an
 * empty StateValue is created, stored under the key and returned.
 */
private StateValue getState(StateKey key) {
    StateValue value = state.get(key);
    if (value == null) {
        // first access for this key - register a fresh value
        value = new StateValue();
        state.put(key, value);
    }
    return value;
}
/** Records, under the given key, agents that are being started for the specified capacity. */
public void addFutureAgents(StateKey key, FutureGridServiceAgent[] futureAgents, CapacityRequirements capacityRequirements) {
    final StateValue value = getState(key);
    value.addFutureAgents(futureAgents, capacityRequirements);
}
/** Adds capacity on the specified agent to the allocated capacity of the key. */
public void allocateCapacity(StateKey key, String agentUid, CapacityRequirements capacity) {
    final StateValue value = getState(key);
    value.allocateCapacity(agentUid, capacity);
}
/** Moves capacity on the specified agent from allocated to marked-for-deallocation, for the key. */
public void markCapacityForDeallocation(StateKey key, String agentUid, CapacityRequirements capacity) {
    final StateValue value = getState(key);
    value.markCapacityForDeallocation(agentUid, capacity);
}
/** Moves capacity on the specified agent from marked-for-deallocation back to allocated, for the key. */
public void unmarkCapacityForDeallocation(StateKey key, String agentUid, CapacityRequirements capacity) {
    final StateValue value = getState(key);
    value.unmarkCapacityForDeallocation(agentUid, capacity);
}
/** Removes capacity on the specified agent from the marked-for-deallocation capacity of the key. */
public void deallocateCapacity(StateKey key, String agentUid, CapacityRequirements capacity) {
    final StateValue value = getState(key);
    value.deallocateCapacity(agentUid, capacity);
}
/** @return the capacity per agent that is marked for de-allocation under the key. */
public CapacityRequirementsPerAgent getCapacityMarkedForDeallocation(StateKey key) {
    final StateValue value = getState(key);
    return value.markedForDeallocationCapacity;
}
/** @return the capacity per agent that is allocated under the key. */
public CapacityRequirementsPerAgent getAllocatedCapacity(StateKey key) {
    final StateValue value = getState(key);
    return value.allocatedCapacity;
}
/**
 * Sums the allocated capacity of all other keys that belong to the same
 * processing unit as the specified key but have different agent zones.
 * The zones comparison is null-safe, in line with {@link StateKey#equals(Object)}
 * which accepts keys without zones.
 *
 * @param key - the key whose own allocation is excluded from the sum
 * @return the aggregated allocated capacity per agent (empty if there are no such keys)
 */
public CapacityRequirementsPerAgent getAllocatedCapacityOfOtherKeysFromSamePu(StateKey key) {
    CapacityRequirementsPerAgent capacityRequirementsPerAgent = new CapacityRequirementsPerAgent();
    for (Entry<StateKey, StateValue> pair : state.entrySet()) {
        final StateKey otherKey = pair.getKey();
        if (!otherKey.pu.equals(key.pu)) {
            continue;
        }
        final ZonesConfig otherZones = otherKey.gridServiceAgentZones;
        // a null zones value only equals another null zones value
        final boolean sameZones =
                (otherZones == null) ? (key.gridServiceAgentZones == null)
                                     : otherZones.equals(key.gridServiceAgentZones);
        if (!sameZones) {
            //same pu, different agent zone
            capacityRequirementsPerAgent = capacityRequirementsPerAgent.add(pair.getValue().allocatedCapacity);
        }
    }
    return capacityRequirementsPerAgent;
}
/** @return the number of future-agent groups registered under the key. */
public int getNumberOfFutureAgents(StateKey key) {
    final StateValue value = getState(key);
    return value.futureAgents.size();
}
/** @return an unmodifiable view of the future-agent groups registered under the key. */
public Collection<GridServiceAgentFutures> getFutureAgents(StateKey key) {
    final StateValue value = getState(key);
    return Collections.unmodifiableCollection(value.futureAgents);
}
/** @return the future-agent groups of the key that are done. */
public Collection<GridServiceAgentFutures> getAllDoneFutureAgents(StateKey key) {
    final StateValue value = getState(key);
    return value.getAllDoneFutureAgents();
}
/**
 * Lists the uids of all grid service agents used by any processing unit,
 * including agents whose capacity is only marked for deallocation.
 */
public Collection<String> getAllUsedAgentUids() {
    final CapacityRequirementsPerAgent usedCapacity = getAllUsedCapacity();
    return usedCapacity.getAgentUids();
}
/**
 * Aggregates the used capacity (allocated plus marked for deallocation)
 * over all keys, i.e. over all processing units.
 */
public CapacityRequirementsPerAgent getAllUsedCapacity() {
    CapacityRequirementsPerAgent total = new CapacityRequirementsPerAgent();
    final Iterator<StateKey> keys = state.keySet().iterator();
    while (keys.hasNext()) {
        final CapacityRequirementsPerAgent usedByKey = getAllUsedCapacity(keys.next());
        total = total.add(usedByKey);
    }
    return total;
}
/**
 * Returns the capacity used under the specified key: its allocated capacity
 * plus its capacity marked for deallocation.
 */
private CapacityRequirementsPerAgent getAllUsedCapacity(StateKey key) {
    final StateValue value = getState(key);
    final CapacityRequirementsPerAgent allocated = value.allocatedCapacity;
    return allocated.add(value.markedForDeallocationCapacity);
}
/**
 * Checks whether any key of a different processing unit holds non-zero capacity
 * (allocated or marked for deallocation) on the specified agent.
 *
 * @return true if processing units other than the specified PU also use the specified agent, false otherwise.
 */
public boolean isAgentSharedWithOtherProcessingUnits(ProcessingUnit pu, String agentUid) {
    for (Entry<StateKey, StateValue> pair : state.entrySet()) {
        if (!pair.getKey().pu.equals(pu)) {
            final StateValue otherValue = pair.getValue();
            final boolean allocatedOnAgent =
                    !otherValue.allocatedCapacity.getAgentCapacityOrZero(agentUid).equalsZero();
            if (allocatedOnAgent
                    || !otherValue.markedForDeallocationCapacity.getAgentCapacityOrZero(agentUid).equalsZero()) {
                return true;
            }
        }
    }
    return false;
}
/** @return a snapshot of the machines being stopped under the specified key. */
public Collection<FutureStoppedMachine> getMachinesGoingDown(StateKey key) {
    final StateValue value = getState(key);
    return value.getMachineGoingDown();
}
/**
 * Marks everything the key has allocated on the agent for deallocation
 * and then records the agent as failed for the key.
 */
public void markAgentAsFailed(StateKey key, String agentUid) {
    // order matters: capacity is marked first, then the failure is recorded
    this.markAgentCapacityForDeallocation(key, agentUid);
    this.addFailedAgent(key, agentUid);
}
/** Marks everything the key has allocated on the agent for deallocation. */
public void markAgentRestrictedForPu(StateKey key, String agentUid) {
    this.markAgentCapacityForDeallocation(key, agentUid);
}
/** Marks the whole capacity that the key has allocated on the agent for deallocation. */
private void markAgentCapacityForDeallocation(StateKey key, String uid) {
    final CapacityRequirementsPerAgent allocated = getAllocatedCapacity(key);
    final CapacityRequirements allocatedOnAgent = allocated.getAgentCapacity(uid);
    markCapacityForDeallocation(key, uid, allocatedOnAgent);
}
/** De-allocates the whole capacity that the key has marked for deallocation on the agent. */
public void deallocateAgentCapacity(StateKey key, String agentUid) {
    final CapacityRequirementsPerAgent marked = getCapacityMarkedForDeallocation(key);
    final CapacityRequirements markedOnAgent = marked.getAgentCapacity(agentUid);
    deallocateCapacity(key, agentUid, markedOnAgent);
}
/**
 * Collects the agents that the PU of the specified key cannot be deployed on, each with the reasons why.
 * An agent is restricted when (unless the key's isolation is a PublicMachineIsolation) a key with a
 * different machine isolation uses it, is deallocating from it or is shutting it down, or when it started
 * a container whose zones do not satisfy the container zones of any key with the same isolation;
 * when it was started for a pending reservation of some key but not allocated yet;
 * or when its exact zones do not satisfy the agent zones of the key.
 * @param key - the processing unit and the zones that the grid service agent should have
 * @return all Grid Service Agent UIDs that the specified PU cannot be deploy on due to machine isolation restrictions
 * or due to the fact that the machine is about to be deployed by another PU that started it.
 * The map keys contain the agent UIDs, and the map values contains the reasons for the restriction.
 */
public Map<String,List<String>> getRestrictedAgentUids(StateKey key) {
Admin admin = key.pu.getAdmin();
ElasticProcessingUnitMachineIsolation puIsolation = getState(key).machineIsolation;
Map<String,List<String>> restrictedAgentUidsWithReason = new HashMap<String,List<String>>();
// isolation based restrictions are skipped entirely for public machine isolation
if (!(puIsolation instanceof PublicMachineIsolation)) {
//find all PUs with different machine isolation, and same machine isolation
final Collection<StateKey> keysWithDifferentIsolation = getKeysWithDifferentIsolation(key);
final Collection<StateKey> keysWithSameIsolation = getKeysWithSameIsolation(key);
for (StateKey otherKey: keysWithDifferentIsolation) {
StateValue otherValue = getState(otherKey);
// agents that a differently isolated key has allocated capacity on
for (String agentUid : otherValue.allocatedCapacity.getAgentUids()) {
initValue(restrictedAgentUidsWithReason, agentUid);
restrictedAgentUidsWithReason.get(agentUid).add(otherKey.pu + "machineIsolation=" + getState(otherKey).machineIsolation + " allocated on machine which restricts " + key.pu.getName() + " machineIsolation="+getState(key).machineIsolation);
}
// agents that a differently isolated key is still deallocating from
for (String agentUid : otherValue.markedForDeallocationCapacity.getAgentUids()) {
initValue(restrictedAgentUidsWithReason, agentUid);
restrictedAgentUidsWithReason.get(agentUid).add(otherKey.pu + "machineIsolation=" + getState(otherKey).machineIsolation + " marked for deallocation on machine which restricts " + key.pu.getName() + " machineIsolation="+getState(key).machineIsolation);
}
// agents of machines that a differently isolated key is stopping
for (FutureStoppedMachine futureStoppedMachine : otherValue.getMachineGoingDown()) {
GridServiceAgent agent = futureStoppedMachine.getGridServiceAgent();
initValue(restrictedAgentUidsWithReason, agent.getUid());
restrictedAgentUidsWithReason.get(agent.getUid()).add(otherKey.pu + "machineIsolation=" + getState(otherKey).machineIsolation + " is shutting down the agent which restricts " + key.pu.getName() + " machineIsolation="+getState(key).machineIsolation);
}
}
// add all agents that started containers that are not with the same isolation
Set<ZonesConfig> allowedContainerZoness = new HashSet<ZonesConfig>();
for (StateKey otherKey : keysWithSameIsolation) {
allowedContainerZoness.add(otherKey.pu.getRequiredContainerZones());
}
for (GridServiceContainer container : admin.getGridServiceContainers()) {
if (container.getGridServiceAgent() == null) {
// ignore manually started containers using gsc.bat
continue;
}
// a container is allowed if its exact zones satisfy at least one of the allowed container zones
boolean allowed = false;
for (ZonesConfig allowedContainerZones : allowedContainerZoness) {
if (container.getExactZones().isStasfies(allowedContainerZones)) {
allowed = true;
break;
}
}
if (!allowed) {
String agentUid = container.getGridServiceAgent().getUid();
initValue(restrictedAgentUidsWithReason, agentUid);
restrictedAgentUidsWithReason.get(agentUid).add("Machine has a container with restricted zones " + ContainersSlaUtils.gscToString(container));
}
}
}
// add all future grid service agents that have been started but not allocated yet
Map<GSAReservationId, Collection<GridServiceAgent>> agentsByReservationId = ((InternalGridServiceAgents)admin.getGridServiceAgents()).getAgentsGroupByReservationId();
Map<GSAReservationId, StateKey> futureAgentsReservationIds = getFutureAgentsReservationIds();
for (Entry<GSAReservationId, StateKey> pair : futureAgentsReservationIds.entrySet()) {
GSAReservationId reservationId = pair.getKey();
StateKey startedTheAgent = pair.getValue();
Collection<GridServiceAgent> reservedAgents = agentsByReservationId.get(reservationId);
// null means no discovered agent carries this reservation id
if (reservedAgents != null) {
for (GridServiceAgent agent : reservedAgents) {
String agentUid = agent.getUid();
initValue(restrictedAgentUidsWithReason, agentUid);
restrictedAgentUidsWithReason.get(agentUid).add("Agent has been started by " + startedTheAgent +" but not allocated yet. ReservationID=" + reservationId);
}
}
}
if (key.gridServiceAgentZones != null) {
//add all agents that do not have this specific zone
//notice that unlike MachinesSlaUtils#zoneFilter that only validates MachineProvisioning.getGSAZones()
//this check is more restrictive. The machine is ok, just we cannot deploy on it with the given key
for (GridServiceAgent agent : admin.getGridServiceAgents()) {
if (!agent.getExactZones().isStasfies(key.gridServiceAgentZones)) {
String agentUid = agent.getUid();
initValue(restrictedAgentUidsWithReason, agentUid);
restrictedAgentUidsWithReason.get(agentUid).add("Agent zones=" + agent.getExactZones().getZones() +" does not match " + key.gridServiceAgentZones);
}
}
}
return restrictedAgentUidsWithReason;
}
/**
 * Collects all keys (the specified one included) whose machine isolation
 * equals the isolation of the specified key.
 *
 * @throws IllegalStateException if any key in the state has no machine isolation set
 */
private Collection<StateKey> getKeysWithSameIsolation(StateKey key) {
    // resolved before iterating, since the lookup may register the key
    final ElasticProcessingUnitMachineIsolation referenceIsolation = getState(key).machineIsolation;
    final Collection<StateKey> matchingKeys = new HashSet<StateKey>();
    for (final Entry<StateKey, StateValue> pair : state.entrySet()) {
        final StateKey candidate = pair.getKey();
        final ElasticProcessingUnitMachineIsolation candidateIsolation = pair.getValue().machineIsolation;
        if (candidateIsolation == null) {
            throw new IllegalStateException(candidate + " should have set machine isolation");
        }
        if (candidateIsolation.equals(referenceIsolation)) {
            matchingKeys.add(candidate);
        }
    }
    if (logger.isDebugEnabled()) {
        logger.debug("PUs with same isolation of " + key + " are: " + matchingKeys);
    }
    return matchingKeys;
}
/**
 * Collects all keys whose machine isolation does not equal the isolation
 * of the specified key.
 *
 * @throws IllegalStateException if any key in the state has no machine isolation set
 */
private Collection<StateKey> getKeysWithDifferentIsolation(StateKey key) {
    // resolved before iterating, since the lookup may register the key
    final ElasticProcessingUnitMachineIsolation referenceIsolation = getState(key).machineIsolation;
    final Collection<StateKey> differingKeys = new HashSet<StateKey>();
    for (final Entry<StateKey, StateValue> pair : state.entrySet()) {
        final StateKey candidate = pair.getKey();
        final ElasticProcessingUnitMachineIsolation candidateIsolation = pair.getValue().machineIsolation;
        if (candidateIsolation == null) {
            throw new IllegalStateException(candidate + " should have set machine isolation");
        }
        final boolean sameIsolation = candidateIsolation.equals(referenceIsolation);
        if (!sameIsolation) {
            differingKeys.add(candidate);
        }
    }
    if (logger.isDebugEnabled()) {
        logger.debug("PUs with different isolation than pu " + key +" are: "+ differingKeys);
    }
    return differingKeys;
}
/**
 * Maps every reservation id of every pending future-agent group to the key
 * the group is registered under.
 */
private Map<GSAReservationId, StateKey> getFutureAgentsReservationIds() {
    final Map<GSAReservationId, StateKey> ownerByReservation = new HashMap<GSAReservationId, MachinesSlaEnforcementState.StateKey>();
    for (final Entry<StateKey, StateValue> pair : state.entrySet()) {
        final StateKey owner = pair.getKey();
        for (final GridServiceAgentFutures pendingGroup : pair.getValue().futureAgents) {
            for (final GSAReservationId reservationId : pendingGroup.getReservationIds()) {
                ownerByReservation.put(reservationId, owner);
            }
        }
    }
    return ownerByReservation;
}
/** Makes sure the map has an entry for the key, adding an empty list when the key is absent. */
private void initValue(Map<String, List<String>> mapOfLists, String key) {
    if (mapOfLists.containsKey(key)) {
        return;
    }
    final List<String> emptyReasons = new LinkedList<String>();
    mapOfLists.put(key, emptyReasons);
}
/**
 * Removes a done group of future agents from the key. For every agent of the group:
 * if it replaces a failed agent, the failed agent record is removed from all keys
 * and its context is dropped; then the context of the started agent is stored.
 *
 * @throws IllegalStateException wrapping an ExecutionException or TimeoutException
 *         raised while reading the result of a future agent
 */
public void removeSuccesfullyStartedFutureAgents(StateKey key, GridServiceAgentFutures doneFutureAgents) {
    getState(key).removeFutureAgents(doneFutureAgents);
    final FutureGridServiceAgent[] startedAgents = doneFutureAgents.getFutureGridServiceAgents();
    for (final FutureGridServiceAgent startedAgent : startedAgents) {
        final FailedGridServiceAgent replacedAgent = startedAgent.getFailedGridServiceAgent();
        if (replacedAgent != null) {
            // the new machine replaces the failed agent, so forget the failure everywhere
            final String replacedAgentUid = replacedAgent.getAgentUid();
            for (final StateKey anyKey : state.keySet()) {
                unmarkAgentAsFailed(anyKey, replacedAgentUid);
            }
            removeAgentContext(replacedAgentUid);
        }
        // keep the agent context, so the agent could be resurrected if it fails
        try {
            final String startedAgentUid = startedAgent.get().getAgent().getUid();
            final Object startedAgentContext = startedAgent.get().getAgentContext();
            addAgentContext(startedAgentUid, startedAgentContext);
        } catch (ExecutionException e) {
            throw new IllegalStateException(e);
        } catch (TimeoutException e) {
            throw new IllegalStateException(e);
        }
    }
}
/** Removes the failed-agent records of the specified agent uid from the key. */
public void unmarkAgentAsFailed(StateKey key, String agentUid) {
    final StateValue value = getState(key);
    value.removeFailedAgent(agentUid);
}
/** Stores (or replaces) the context of the agent and bumps the state version. */
private void addAgentContext(String agentUid, Object agentContext) {
    this.agentsContext.put(agentUid, agentContext);
    this.machinesStateVersion++;
}
/** @return the context stored for the agent uid, or null if none is stored. */
public Object getAgentContext(String agentUid) {
    final Object storedContext = agentsContext.get(agentUid);
    return storedContext;
}
/**
 * Forgets a machine that was being stopped under the key and drops the
 * context stored for its agent.
 */
public void removeFutureStoppedMachine(StateKey key, FutureStoppedMachine futureStoppedMachine) {
    final StateValue value = getState(key);
    value.removeFutureStoppedMachine(futureStoppedMachine);
    removeAgentContext(futureStoppedMachine.getGridServiceAgent().getUid());
}
/** Drops the context stored for the agent (if any) and bumps the state version. */
private void removeAgentContext(final String agentUid) {
    this.agentsContext.remove(agentUid);
    this.machinesStateVersion++;
}
/** @return an unmodifiable snapshot of the machines being stopped under any key. */
public Collection<FutureStoppedMachine> getMachinesGoingDown() {
    final List<FutureStoppedMachine> stopping = new ArrayList<FutureStoppedMachine>();
    final Iterator<StateValue> values = state.values().iterator();
    while (values.hasNext()) {
        stopping.addAll(values.next().getMachineGoingDown());
    }
    // the local list never escapes, so wrapping it yields an immutable snapshot
    return Collections.unmodifiableList(stopping);
}
/** Registers, under the key, a machine that is in the process of being stopped. */
public void addFutureStoppedMachine(StateKey key, FutureStoppedMachine futureStoppedMachine) {
    final StateValue value = getState(key);
    value.addFutureStoppedMachine(futureStoppedMachine);
}
/**
 * @return the uids of the agents on which the key has capacity allocated
 *         or marked for deallocation.
 */
public Collection<String> getUsedAgentUids(StateKey key) {
    final StateValue value = getState(key);
    final CapacityRequirementsPerAgent used =
            value.allocatedCapacity.add(value.markedForDeallocationCapacity);
    return used.getAgentUids();
}
/**
 * Sets the machine isolation of the key.
 *
 * @throws IllegalArgumentException if isolation is null
 */
public void setMachineIsolation(StateKey key, ElasticProcessingUnitMachineIsolation isolation) {
    if (null == isolation) {
        throw new IllegalArgumentException("machine isolation cannot be null");
    }
    if (logger.isDebugEnabled()) {
        logger.debug(key + " machine isolation is " + isolation);
    }
    final StateValue value = getState(key);
    value.machineIsolation = isolation;
}
/**
 * @return the machine isolation of the key
 * @throws IllegalStateException if no machine isolation has been set for the key
 */
public ElasticProcessingUnitMachineIsolation getMachineIsolation(StateKey key) {
    final ElasticProcessingUnitMachineIsolation isolation = getState(key).machineIsolation;
    if (isolation != null) {
        return isolation;
    }
    throw new IllegalStateException(key + " machine isolation has not been defined");
}
/** @return true if state recovery has been flagged as completed for the key. */
public boolean isCompletedStateRecovery(StateKey key) {
    final StateValue value = getState(key);
    return value.completedStateRecoveryAfterRestart;
}
/** Flags state recovery as completed for the key. */
public void completedStateRecovery(StateKey key) {
    final StateValue value = getState(key);
    value.completedStateRecoveryAfterRestart();
}
/** Records that state recovery succeeded for the processing unit. */
public void recoveredStateOnEsmStart(ProcessingUnit otherPu) {
    final RecoveryState outcome = RecoveryState.RECOVERY_SUCCESS;
    recoveredStatePerProcessingUnit.put(otherPu, outcome);
}
/** Records that state recovery failed for the processing unit. */
public void failedRecoveredStateOnEsmStart(ProcessingUnit otherPu) {
    final RecoveryState outcome = RecoveryState.RECOVERY_FAILED;
    recoveredStatePerProcessingUnit.put(otherPu, outcome);
}
/**
 * @return the recovery outcome recorded for the processing unit, or
 *         NOT_RECOVERED when nothing has been recorded.
 */
public RecoveryState getRecoveredStateOnEsmStart(ProcessingUnit pu) {
    final RecoveryState recorded = recoveredStatePerProcessingUnit.get(pu);
    return (recorded != null) ? recorded : RecoveryState.NOT_RECOVERED;
}
/**
 * @return the agent zones of all keys returned by
 *         {@link #getStateForProcessingUnit(ProcessingUnit)} for the pu.
 */
public Set<ZonesConfig> getGridServiceAgentsZones(ProcessingUnit pu) {
    final Set<ZonesConfig> agentZones = new HashSet<ZonesConfig>();
    final Map<StateKey, StateValue> puState = getStateForProcessingUnit(pu);
    for (final StateKey puKey : puState.keySet()) {
        agentZones.add(puKey.gridServiceAgentZones);
    }
    return agentZones;
}
/**
 * @return the agent zones of the keys of the processing unit whose state
 *         equals zero (nothing allocated, marked for deallocation or being started).
 */
public Set<ZonesConfig> getUndeployedGridServiceAgentsZones(ProcessingUnit pu) {
    final Set<ZonesConfig> emptyZones = new HashSet<ZonesConfig>();
    for (final Entry<StateKey, StateValue> pair : state.entrySet()) {
        final StateKey candidate = pair.getKey();
        if (candidate.pu.equals(pu) && pair.getValue().equalsZero()) {
            emptyZones.add(candidate.gridServiceAgentZones);
        }
    }
    return emptyZones;
}
/**
 * Builds a report of the allocated capacity per processing unit on each agent.
 * Agents that are restricted for the specified key, and agents that can no
 * longer be resolved by uid, are left out of the report.
 */
public Map<GridServiceAgent, Map<ProcessingUnit, CapacityRequirements>> groupCapacityPerProcessingUnitPerAgent(StateKey key) {
    final Map<GridServiceAgent,Map<ProcessingUnit,CapacityRequirements>> report = new HashMap<GridServiceAgent,Map<ProcessingUnit,CapacityRequirements>>();
    final Admin admin = key.pu.getAdmin();
    final Collection<String> restrictedUids = getRestrictedAgentUids(key).keySet();
    for (final Entry<StateKey, StateValue> pair : state.entrySet()) {
        final ProcessingUnit ownerPu = pair.getKey().pu;
        final CapacityRequirementsPerAgent ownerAllocation = pair.getValue().allocatedCapacity;
        for (final String agentUid : ownerAllocation.getAgentUids()) {
            final GridServiceAgent agent = admin.getGridServiceAgents().getAgentByUID(agentUid);
            if (restrictedUids.contains(agentUid) || agent == null) {
                continue;
            }
            // lazily create the per-agent map
            Map<ProcessingUnit, CapacityRequirements> capacityPerPu = report.get(agent);
            if (capacityPerPu == null) {
                capacityPerPu = new HashMap<ProcessingUnit, CapacityRequirements>();
                report.put(agent, capacityPerPu);
            }
            // start from zero capacity for a pu seen for the first time on this agent
            CapacityRequirements accumulated = capacityPerPu.get(ownerPu);
            if (accumulated == null) {
                accumulated = new CapacityRequirements();
            }
            final CapacityRequirements increase = ownerAllocation.getAgentCapacity(agentUid);
            capacityPerPu.put(ownerPu, accumulated.add(increase));
        }
    }
    return report;
}
/**
 * @return the allocated capacity of the processing unit, summed over all
 *         agent zones reported by {@link #getGridServiceAgentsZones(ProcessingUnit)}.
 */
public CapacityRequirementsPerAgent getAllocatedCapacity(ProcessingUnit otherPu) {
    CapacityRequirementsPerAgent total = new CapacityRequirementsPerAgent();
    final Set<ZonesConfig> puZones = getGridServiceAgentsZones(otherPu);
    for (final ZonesConfig zones : puZones) {
        final StateKey zonesKey = new StateKey(otherPu, zones);
        total = total.add(getAllocatedCapacity(zonesKey));
    }
    return total;
}
/**
 * Changes the key.zone of the allocated capacity to match the exact zone of the agent.
 * For every agent whose exact zones differ from the zones of the key, the capacity is
 * de-allocated from the key and allocated under a key with the agent's exact zones
 * (carrying over the machine isolation). The key is removed once it has no
 * allocated capacity left.
 * NOTE(review): the removal guard checks only the allocated capacity, while removeKey
 * throws unless the whole state of the key equals zero - confirm this is intended.
 * @return false if nothing changed, true if replace occurred
 */
public boolean replaceAllocatedCapacity(StateKey key, Admin admin) {
    final CapacityRequirementsPerAgent allocatedBefore = getState(key).allocatedCapacity;
    // snapshot the uids, allocations are modified while looping
    final List<String> uidsSnapshot = new ArrayList<String>(allocatedBefore.getAgentUids());
    boolean movedAnything = false;
    for (final String agentUid : uidsSnapshot) {
        final GridServiceAgent agent = admin.getGridServiceAgents().getAgentByUID(agentUid);
        if (agent == null) {
            //Agent could be null if it failed but it's failover is temporarily disabled.
            continue;
        }
        final ExactZonesConfig exactAgentZones = agent.getExactZones();
        if (key.gridServiceAgentZones.equals(exactAgentZones)) {
            // already tracked under the exact zones
            continue;
        }
        // move the allocation from key.agentZones to the exact agent zones
        final CapacityRequirements moved = allocatedBefore.getAgentCapacity(agentUid);
        markCapacityForDeallocation(key, agentUid, moved);
        deallocateCapacity(key, agentUid, moved);
        final StateKey exactKey = new StateKey(key.pu, exactAgentZones);
        setMachineIsolation(exactKey, getMachineIsolation(key));
        allocateCapacity(exactKey, agentUid, moved);
        movedAnything = true;
    }
    final boolean nothingAllocated = getState(key).allocatedCapacity.equalsZero();
    if (nothingAllocated) {
        removeKey(key);
    }
    return movedAnything;
}
/**
 * Removes the key from the state.
 *
 * @throws IllegalStateException if the state of the key does not equal zero
 */
private void removeKey(StateKey key) {
    final StateValue value = getState(key);
    final boolean removable = value.equalsZero();
    if (!removable) {
        throw new IllegalStateException("Cannot remove " + key + " since it does not equal zero " + value);
    }
    state.remove(key);
}
/**
 * @return the failed agents of the key for which no pending future agent
 *         (of the same key) references them as the failed agent it replaces.
 */
public RecoveringFailedGridServiceAgent[] getAgentsMarkedAsFailedNotBeingRecovered(StateKey key) {
    // uids of failed agents that a pending future agent is already replacing
    final Set<String> uidsBeingRecovered = new LinkedHashSet<String>();
    for (final GridServiceAgentFutures pendingGroup : getFutureAgents(key)) {
        for (final FutureGridServiceAgent pendingAgent : pendingGroup.getFutureGridServiceAgents()) {
            final FailedGridServiceAgent replaced = pendingAgent.getFailedGridServiceAgent();
            if (replaced == null) {
                continue;
            }
            uidsBeingRecovered.add(replaced.getAgentUid());
        }
    }
    final List<RecoveringFailedGridServiceAgent> notRecovering = new ArrayList<RecoveringFailedGridServiceAgent>();
    for (final RecoveringFailedGridServiceAgent candidate : getState(key).getFailedAgents()) {
        final boolean beingRecovered = uidsBeingRecovered.contains(candidate.getAgentUid());
        if (!beingRecovered) {
            notRecovering.add(candidate);
        }
    }
    final RecoveringFailedGridServiceAgent[] result = new RecoveringFailedGridServiceAgent[notRecovering.size()];
    return notRecovering.toArray(result);
}
/** @return an array with all agents recorded as failed under the key. */
public RecoveringFailedGridServiceAgent[] getAgentsMarkedAsFailed(StateKey key) {
    final Collection<RecoveringFailedGridServiceAgent> failed = getState(key).getFailedAgents();
    final RecoveringFailedGridServiceAgent[] result = new RecoveringFailedGridServiceAgent[failed.size()];
    return failed.toArray(result);
}
/**
 * Records the agent as failed in the state of the specified key.
 * If any state value already holds a failed agent with the same uid, that existing
 * object is reused; otherwise a new RecoveringFailedGridServiceAgent is created.
 *
 * @param key the key under which the failure is recorded
 * @param agentUid the uid of the failed agent
 */
private void addFailedAgent(StateKey key, String agentUid) {
    RecoveringFailedGridServiceAgent sharedFailedAgent = null;
    // every state value is scanned; within one value the scan stops at the first match,
    // so a match in a later state value replaces a match found in an earlier one
    for (final StateValue otherValue : state.values()) {
        final Iterator<RecoveringFailedGridServiceAgent> candidates = otherValue.getFailedAgents().iterator();
        boolean matched = false;
        while (!matched && candidates.hasNext()) {
            final RecoveringFailedGridServiceAgent candidate = candidates.next();
            if (candidate.getAgentUid().equals(agentUid)) {
                // the failure of this agent was already recorded (possibly by another PU/key)
                sharedFailedAgent = candidate;
                matched = true;
            }
        }
    }

    final RecoveringFailedGridServiceAgent toAdd;
    if (sharedFailedAgent != null) {
        toAdd = sharedFailedAgent;
    } else {
        // first time this agent is recorded as failed
        toAdd = new RecoveringFailedGridServiceAgent(agentUid);
    }
    getState(key).addFailedAgent(toAdd);
}
/**
 * Removes the processing unit from the collection of processing units that already passed
 * {@link #validateUndeployNotInProgress(ProcessingUnit)}, so the next call validates it again.
 *
 * @param pu the processing unit that is about to be undeployed
 */
public void beforeUndeployProcessingUnit(ProcessingUnit pu) {
    this.validatedUndeployNotInProgressPerProcessingUnit.remove(pu);
}
/**
 * Removes all state related to the specified processing unit: every state entry whose key
 * belongs to the processing unit, plus its recovered-state and cloud-cleanup entries.
 * Call this method only if you are not going to call any other state method on this pu.
 *
 * @param pu the processing unit that was undeployed
 */
public void afterUndeployProcessingUnit(ProcessingUnit pu) {
    // removal goes through the iterator so the map can be modified while it is traversed
    for (final Iterator<StateKey> keys = state.keySet().iterator(); keys.hasNext();) {
        final StateKey candidate = keys.next();
        if (candidate.pu.equals(pu)) {
            keys.remove();
        }
    }
    recoveredStatePerProcessingUnit.remove(pu);
    cloudCleanupPerProcessingUnit.remove(pu);
}
/**
 * Collects the state entries that belong to the specified processing unit and whose value
 * does not equal zero.
 *
 * @param pu the processing unit to filter by
 * @return a new sorted map with the matching entries (possibly empty)
 */
public Map<StateKey, StateValue> getStateForProcessingUnit(ProcessingUnit pu) {
    // a TreeMap keeps the keys sorted, which makes toString() deterministic
    final Map<StateKey, StateValue> filtered = new TreeMap<StateKey, StateValue>();
    for (final Entry<StateKey, StateValue> entry : state.entrySet()) {
        final StateKey entryKey = entry.getKey();
        if (!entryKey.pu.equals(pu)) {
            continue;
        }
        final StateValue entryValue = entry.getValue();
        if (entryValue.equalsZero()) {
            continue;
        }
        filtered.put(entryKey, entryValue);
    }
    return filtered;
}
/**
 * Verifies that no non-zero state is left for the processing unit.
 * Once the check has passed, the processing unit is remembered and later calls return
 * immediately until {@link #beforeUndeployProcessingUnit(ProcessingUnit)} forgets it.
 *
 * @param pu the processing unit to validate
 * @throws UndeployInProgressException if non-zero state still exists for the processing unit
 */
public void validateUndeployNotInProgress(ProcessingUnit pu) throws UndeployInProgressException {
    if (validatedUndeployNotInProgressPerProcessingUnit.contains(pu)) {
        // already validated
        return;
    }

    final Map<StateKey, StateValue> leftoverState = getStateForProcessingUnit(pu);
    if (leftoverState.isEmpty()) {
        // undeploy is not in progress
        validatedUndeployNotInProgressPerProcessingUnit.add(pu);
        return;
    }

    // leftover state is treated as an undeploy of this processing unit that is in progress somewhere else
    final UndeployInProgressException failure = new UndeployInProgressException(pu);
    logger.info(failure.getMessage() + " Details: " + leftoverState.toString(), failure);
    throw failure;
}
/**
 * Checks whether a different key that has the same isolation as the specified key
 * currently has at least one future agent.
 *
 * @param key the key to compare against
 * @return true - If there are future machines in other keys that can be shared with this key
 */
public boolean isFutureAgentsOfOtherSharedServices(StateKey key) {
    final Collection<StateKey> sameIsolationKeys = getKeysWithSameIsolation(key);
    for (final StateKey candidate : state.keySet()) {
        if (key.equals(candidate)) {
            // the key itself does not count
            continue;
        }
        if (!sameIsolationKeys.contains(candidate)) {
            continue;
        }
        if (getNumberOfFutureAgents(candidate) > 0) {
            return true;
        }
    }
    return false;
}
/**
 * @param pu the processing unit to look up
 * @return the cloud cleanup future stored for the processing unit, as returned by the
 *         underlying lookup (presumably null when nothing was stored)
 */
public FutureCleanupCloudResources getCleanupFuture(ProcessingUnit pu) {
    final FutureCleanupCloudResources cleanupFuture = cloudCleanupPerProcessingUnit.get(pu);
    return cleanupFuture;
}
/**
 * Stores the cloud cleanup future of the processing unit, replacing any previous one.
 *
 * @param pu the processing unit the future belongs to
 * @param future the cleanup future to store
 */
public void setCleanupFuture(ProcessingUnit pu, FutureCleanupCloudResources future) {
    this.cloudCleanupPerProcessingUnit.put(pu, future);
}
/**
 * Registers the agent as the one whose failover detection is disabled for the ip address.
 *
 * @param ipAddress the ip address under which the agent is registered
 * @param agentUid the uid of the agent whose failover detection is disabled
 * @return the value returned by put() on the underlying collection - presumably the agent uid
 *         that was previously registered for the ip address, or null if there was none
 */
public String disableFailoverDetection(String ipAddress, String agentUid) {
    final String previousAgentUid = agentWithFailoverDisabledPerIpAddress.put(ipAddress, agentUid);
    return previousAgentUid;
}
/**
 * Removes the failover-detection-disabled registration of the ip address.
 *
 * @param ipAddress the ip address whose registration is removed
 * @return the value returned by remove() on the underlying collection - presumably the agent uid
 *         that was registered for the ip address, or null if there was none
 */
public String enableFailoverDetection(String ipAddress) {
    final String removedAgentUid = agentWithFailoverDisabledPerIpAddress.remove(ipAddress);
    return removedAgentUid;
}
/**
 * @param agentUid the uid of the agent to check
 * @return true if the agent uid is registered, under any ip address, as having its
 *         failover detection disabled
 */
public boolean isAgentFailoverDisabled(String agentUid) {
    return agentWithFailoverDisabledPerIpAddress.containsValue(agentUid);
}
/**
 * @param ipAddress the ip address to look up
 * @return the agent uid registered with disabled failover detection for the ip address,
 *         as returned by the underlying lookup (presumably null when none is registered)
 */
public String getAgentWithDisabledFailoverDetectionForIpAddress(String ipAddress) {
    final String registeredAgentUid = agentWithFailoverDisabledPerIpAddress.get(ipAddress);
    return registeredAgentUid;
}
/**
 * Replaces one agent with another in every state value, and moves the agent context
 * (if one exists) from the old agent uid to the new agent uid.
 *
 * @param otherAgentUid the uid of the agent being replaced
 * @param newAgentUid the uid of the agent that takes its place
 */
public void replaceAllocation(String otherAgentUid, String newAgentUid) {
    for (final StateValue value : state.values()) {
        value.replaceAllocation(otherAgentUid, newAgentUid);
    }

    final Object movedContext = agentsContext.remove(otherAgentUid);
    if (movedContext == null) {
        // nothing stored for the old agent, so there is nothing to move
        return;
    }
    addAgentContext(newAgentUid, movedContext);
}
/**
 * Builds a {@link MachinesState} snapshot of this state.
 * For every key it emits one properties document per used agent, per agent marked as failed
 * and per machine going down; these documents, the agents context and the platform logical
 * version are stored as the snapshot's properties together with the current version.
 * Note: If you change this method, change also {@link #fromMachinesState(MachinesState)}
 * and update the version properly.
 *
 * @return the snapshot
 */
public MachinesState toMachinesState() {
    final List<DocumentProperties> agentsProperties = new ArrayList<DocumentProperties>();
    for (final StateKey key : state.keySet()) {
        // agents that are in use: neither stopping nor failed
        for (final String usedAgentUid : getAllUsedCapacity(key).getAgentUids()) {
            agentsProperties.add(toAgentProperties(key, usedAgentUid, false, false));
        }
        // agents marked as failed
        for (final RecoveringFailedGridServiceAgent failed : getAgentsMarkedAsFailed(key)) {
            agentsProperties.add(toAgentProperties(key, failed.getAgentUid(), false, true));
        }
        // agents of machines that are going down
        for (final FutureStoppedMachine goingDown : getMachinesGoingDown(key)) {
            agentsProperties.add(toAgentProperties(key, goingDown.getGridServiceAgent().getUid(), true, false));
        }
    }

    final DocumentProperties snapshotProperties = new DocumentProperties()
            .setProperty("platformLogicalVersion", PlatformLogicalVersion.getLogicalVersion())
            .setProperty("agentsContext", agentsContext)
            .setProperty("agentsProperties", agentsProperties);

    final MachinesState snapshot = new MachinesState();
    snapshot.setProperties(snapshotProperties);
    snapshot.setVersion(machinesStateVersion);
    return snapshot;
}
/**
 * Creates the properties document that describes a single agent of the specified key.
 *
 * @param key supplies the processing unit name and the agent zones (converted to a string)
 * @param agentUid the uid of the described agent
 * @param isStopping the value stored under "isStopping"
 * @param isFailed the value stored under "isFailed"
 * @return a new properties document with the puName, agentZones, agentUid, isStopping and isFailed properties
 */
private DocumentProperties toAgentProperties(
        final StateKey key,
        final String agentUid,
        final boolean isStopping,
        final boolean isFailed) {
    return new DocumentProperties()
            .setProperty("puName", key.pu.getName())
            .setProperty("agentZones", ZonesConfigUtils.zonesToString(key.gridServiceAgentZones))
            .setProperty("agentUid", agentUid)
            .setProperty("isStopping", isStopping)
            .setProperty("isFailed", isFailed);
}
/**
 * Restores the version, the agents context and the failed agents from a snapshot created by
 * {@link #toMachinesState()}.
 * An agent of the snapshot is recorded as failed when its processing unit is still known and either
 * it was stored as failed, or it is not among the currently used agents and was not being stopped.
 *
 * @param state the snapshot to restore from (the parameter hides the state field of this object)
 */
public void fromMachinesState(MachinesState state) {
    machinesStateVersion = state.getVersion();
    final DocumentProperties properties = state.getProperties();

    agentsContext.clear();
    final Map<String, Object> restoredContext = (Map<String, Object>) properties.getProperty("agentsContext");
    agentsContext.putAll(restoredContext);

    //detect failed agents
    final Collection<String> usedAgentUids = getAllUsedAgentUids();
    final Map<String, ProcessingUnit> processingUnitByName = getAllProcessingUnits();
    final List<DocumentProperties> agentsProperties = properties.getProperty("agentsProperties");
    for (final DocumentProperties agentProperties : agentsProperties) {
        final boolean wasStopping = (Boolean) agentProperties.getProperty("isStopping");
        final String agentUid = agentProperties.getProperty("agentUid");
        final String puName = agentProperties.getProperty("puName");
        final ProcessingUnit pu = processingUnitByName.get(puName);
        final boolean wasFailed = (Boolean) agentProperties.getProperty("isFailed");

        final boolean markAsFailed;
        if (pu == null) {
            logger.info("Ignoring missing " + puName + " agent " + agentUid + " since " + puName + " was uninstalled");
            markAsFailed = false;
        }
        else if (usedAgentUids.contains(agentUid)) {
            // the agent is still in use, keep the stored flag
            markAsFailed = wasFailed;
        }
        else if (wasStopping) {
            logger.info("Ignoring missing " + puName + " agent " + agentUid + " since it was being stopped");
            // the stored flag is kept as is
            markAsFailed = wasFailed;
        }
        else if (wasFailed) {
            logger.info("Marking " + puName + " agent " + agentUid + " as failed since it was previously marked as failed.");
            markAsFailed = true;
        }
        else {
            // Agent probably failed while ESM was restarting
            logger.info("Marking " + puName + " agent " + agentUid + " as failed since it cannot be discovered.");
            markAsFailed = true;
        }

        if (!markAsFailed) {
            continue;
        }
        final String storedZones = (String) agentProperties.getProperty("agentZones");
        final ZonesConfig agentZones = ZonesConfigUtils.zonesFromString(storedZones);
        addFailedAgent(new StateKey(pu, agentZones), agentUid);
    }
}
/**
 * @return a new map from processing unit name to processing unit, built from the keys of the
 *         state map in their iteration order; a later key with the same name replaces an earlier one
 */
private Map<String, ProcessingUnit> getAllProcessingUnits() {
    final Map<String, ProcessingUnit> processingUnitByName = new LinkedHashMap<String, ProcessingUnit>();
    final Iterator<StateKey> keys = this.state.keySet().iterator();
    while (keys.hasNext()) {
        final ProcessingUnit processingUnit = keys.next().pu;
        processingUnitByName.put(processingUnit.getName(), processingUnit);
    }
    return processingUnitByName;
}
/**
 * @return the machines state version, as last assigned by {@link #fromMachinesState(MachinesState)}
 *         or elsewhere in this class
 */
public long getVersion() {
    return this.machinesStateVersion;
}
}