/*
* NOTE: This copyright does *not* cover user programs that use Hyperic
* program services by normal system calls through the application
* program interfaces provided as part of the Hyperic Plug-in Development
* Kit or the Hyperic Client Development Kit - this is merely considered
* normal use of the program, and does *not* fall under the heading of
* "derived work".
*
* Copyright (C) [2004-2013], VMware, Inc.
* This file is part of Hyperic.
*
* Hyperic is free software; you can redistribute it and/or modify
* it under the terms version 2 of the GNU General Public License as
* published by the Free Software Foundation. This program is distributed
* in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
* even the implied warranty of MERCHANTABILITY or FITNESS FOR A
* PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
* USA.
*/
package org.hyperic.hq.measurement.server.session;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.hyperic.hq.appdef.shared.AppdefEntityConstants;
import org.hyperic.hq.authz.server.session.Resource;
import org.hyperic.hq.authz.shared.AuthzConstants;
import org.hyperic.hq.authz.shared.ResourceManager;
import org.hyperic.hq.measurement.MeasurementConstants;
import org.hyperic.hq.measurement.TimingVoodoo;
import org.hyperic.hq.measurement.shared.AvailabilityManager;
import org.hyperic.hq.product.MetricValue;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
/**
 * Availability status checker for platforms for which availability status data was not received for over 2 intervals.
 * <BR><B>Details:</B>
 * <BR> The checker receives a collection of DataPoints (latest availability status) for platforms that need rechecking.
 * <BR> It checks whether VC associations exist for the platform (i.e. this platform's status is given by a VM agent
 * while there also exists a VCenter agent monitoring it).
 * If so - the status is updated according to the status given by the VCenter agent.
 * <BR> If VC information exists and the Platform is UP - the status of all its servers/services is set to UNKNOWN.
 * <BR> If VC information does not exist, or is DOWN - the status of all its servers/services is set to DOWN.
 * <BR> Agent status is marked as DOWN in any case.
 * <BR> NOTE(review): the optional fixed timestamp is kept in an instance field, so overlapping calls on the same
 * instance would share it - confirm callers never invoke the checker concurrently.
 * @author amalia
 *
 */
@Component
public class AvailabilityFallbackChecker {

    // TODO: (Code review comments)
    // Handle the case of 2 very different intervals. If the VM interval is 1 min, and the VC interval is 1 hr?
    // Perhaps should limit the availability status validity to 5(?) times the VM interval.

    /** Upper bound on the number of data points handed to a single addData() call by the chunked check. */
    private static final int MAX_UPDATES_PER_BATCH = 1000;

    private final Log log = LogFactory.getLog(AvailabilityFallbackChecker.class);

    private final AvailabilityManager availabilityManager;
    private final AvailabilityCache availabilityCache;
    private final ResourceManager resourceManager;

    // For testing purposes, in case we need to perform checks with a constant timestamp.
    // If curTimeStamp is 0, we check for the actual current time.
    private long curTimeStamp = 0;

    // --------------------------------------------------------------------------------------------------------
    // --------------------------------------------------------------------------------------------------------

    /**
     * Creates the checker with its collaborators.
     * @param availabilityManager - used to look up availability measurements and to store the new data points
     * @param availabilityCache - source of the latest known data points; also used as the lock around addData()
     * @param resourceManager - used to re-load resources when looking for the HQ Agent prototype
     */
    @Autowired
    public AvailabilityFallbackChecker(AvailabilityManager availabilityManager, AvailabilityCache availabilityCache,
                                       ResourceManager resourceManager) {
        this.availabilityManager = availabilityManager;
        this.availabilityCache = availabilityCache;
        this.resourceManager = resourceManager;
    }

    // --------------------------------------------------------------------------------------------------------
    // --------------------------------------------------------------------------------------------------------

    /**
     * Check platforms' availability with constant time stamp all through the check. Update DB/Cache accordingly.
     * This means that if we use Ping checks (that may take several seconds), the timestamp will remain the same.
     * Platforms' servers/services statuses may be updated as well.
     * The update itself is performed by {@link #checkAvailabilityInChunks(Collection)}.
     * @param availabilityDataPoints - latest availability status for Platforms
     * @param curTimeStamp - timestamp to use through the checks; 0 means "use the real current time"
     */
    public void checkAvailability(Collection<ResourceDataPoint> availabilityDataPoints, long curTimeStamp) {
        this.curTimeStamp = curTimeStamp;
        try {
            checkAvailabilityInChunks(availabilityDataPoints);
        } finally {
            // Always drop the override, even if the check failed; otherwise every later check
            // would keep using this stale timestamp as "now".
            this.curTimeStamp = 0;
        }
    }

    /**
     * Check platforms' availability. Update DB/Cache accordingly.
     * Platforms' servers/services statuses may be updated as well.
     * Unlike {@link #checkAvailabilityInChunks(Collection)}, all resulting data points are stored with a
     * single addData() call. Does nothing if the given collection is null or empty.
     * @param availabilityDataPoints - latest availability status for Platforms
     */
    public void checkAvailability(Collection<ResourceDataPoint> availabilityDataPoints) {
        if ((availabilityDataPoints == null) || availabilityDataPoints.isEmpty()) {
            return;
        }
        final boolean debug = log.isDebugEnabled();
        if (debug) {
            log.debug("checkAvailability: start");
        }
        final Collection<ResourceDataPoint> resPlatforms = new ArrayList<ResourceDataPoint>();
        final long currTime = getCurTimestamp();
        for (ResourceDataPoint availabilityDataPoint : availabilityDataPoints) {
            if (debug) {
                log.debug("checkAvailability-Platform: " + availabilityDataPoint.getResource().getId() + " value: " + availabilityDataPoint.getValue());
            }
            final ResourceDataPoint platformAvailPoint = checkPlatformAvailability(availabilityDataPoint, currTime);
            if (debug) {
                log.debug("checkAvailability-Platform: " + platformAvailPoint.getResource().getId() + " value: " + platformAvailPoint.getValue());
            }
            resPlatforms.add(platformAvailPoint);
        }
        if (debug) {
            log.debug("checkAvailability: checking " + resPlatforms.size() + " platforms.");
        }
        final List<DataPoint> res = addStatusOfPlatformsDescendants(resPlatforms);
        if (debug) {
            log.debug("checkAvailability: updating " + res.size() + " platforms & descendants.");
        }
        if (!res.isEmpty()) {
            synchronized (availabilityCache) {
                availabilityManager.addData(res, true, true);
            }
        }
    }

    /**
     * This method is written in order to solve issue: HHQ-5619:
     * Backfiller causes StackOverflow when trying to update 73000 resources' availability.
     * The reason for the StackOverflow is a Hibernate 3.2 bug in which queries above 9000-10000 IDs fail with
     * NodeTraverse StackOverflow.
     * TODO: This method should be removed once Hibernate is upgraded.
     *
     * This method is the same as checkAvailability, only the update is done in chunks of up to
     * MAX_UPDATES_PER_BATCH data points: the full list of platform/server/service data points is computed
     * first and is then handed to addData() in consecutive slices.
     *
     * Check platforms' availability. Update DB/Cache accordingly.
     * Platforms' servers/services statuses may be updated as well.
     * Does nothing if the given collection is null or empty.
     * @param availabilityDataPoints - latest availability status for Platforms
     */
    public void checkAvailabilityInChunks(Collection<ResourceDataPoint> availabilityDataPoints) {
        if ((availabilityDataPoints == null) || availabilityDataPoints.isEmpty()) {
            return;
        }
        final boolean debug = log.isDebugEnabled();
        if (debug) {
            log.debug("checkAvailability: start");
        }
        final Collection<ResourceDataPoint> resPlatforms = new ArrayList<ResourceDataPoint>();
        final long currTime = getCurTimestamp();
        for (ResourceDataPoint adp : availabilityDataPoints) {
            final ResourceDataPoint platformAvail = checkPlatformAvailability(adp, currTime);
            if (debug) {
                log.debug("checkAvailability before resourceId=" + adp + ", after resourceId=" + platformAvail);
            }
            resPlatforms.add(platformAvail);
        }
        if (debug) {
            log.debug("marking " + resPlatforms.size() + " platforms down, resourceIds=" + resPlatforms);
        }
        final List<DataPoint> datapoints = addStatusOfPlatformsDescendants(resPlatforms);
        final int total = datapoints.size();
        // HHQ-5726 - only one thread can access availabilityManager.addData() at once
        synchronized (availabilityCache) {
            for (int from = 0; from < total; from += MAX_UPDATES_PER_BATCH) {
                final int to = Math.min(from + MAX_UPDATES_PER_BATCH, total);
                availabilityManager.addData(datapoints.subList(from, to), true, true);
            }
        }
    }

    /**
     * Check if the given Measurement belongs to an HQ Agent, i.e. whether the prototype of its resource is
     * named AppdefEntityConstants.HQ_AGENT_PROTOTYPE_NAME.
     * @param meas - Measurement of a checked server/service.
     * @return true if this is an HQAgent, false otherwise (including when no prototype is available).
     */
    private boolean isHQAgent(Measurement meas) {
        Resource measResource = meas.getResource();
        // TODO remove the following line, and recheck
        measResource = resourceManager.getResourceById(measResource.getId());
        if (measResource == null) {
            // NOTE(review): a resource that can no longer be loaded is treated as "not an agent"
            // instead of failing the whole check - confirm getResourceById may return null.
            return false;
        }
        final Resource prototype = measResource.getPrototype();
        if (prototype == null) {
            return false;
        }
        // Constant-first comparison so a prototype without a name cannot cause a NullPointerException.
        if (AppdefEntityConstants.HQ_AGENT_PROTOTYPE_NAME.equals(prototype.getName())) {
            if (log.isDebugEnabled()) {
                log.debug("isHQHagent: Found: " + measResource.getId());
            }
            return true;
        }
        return false;
    }

    /**
     * Check availability for a single platform.
     * If a status can be derived from VC information it is used as is. Otherwise the platform is considered
     * DOWN: the given point is returned unchanged if it is already DOWN, else a new DOWN point stamped with
     * currTimestamp is created.
     * @param availabilityDataPoint - latest availability status
     * @param currTimestamp - timestamp to give a newly created DOWN point
     * @return new availability status to update
     */
    private ResourceDataPoint checkPlatformAvailability(ResourceDataPoint availabilityDataPoint, long currTimestamp) {
        final ResourceDataPoint vcStatus = getPlatformStatusFromVC(availabilityDataPoint);
        if (vcStatus != null) {
            return vcStatus;
        }
        // If we want to add a check using ping:
        //res = getPlatformStatusByPing(availabilityDataPoint);
        //if (res != null)
        //    return res;
        if (availabilityDataPoint.getMetricValue().getValue() == MeasurementConstants.AVAIL_DOWN) {
            return availabilityDataPoint;
        }
        final DataPoint downPoint = new DataPoint(availabilityDataPoint.getMeasurementId(),
                                                  MeasurementConstants.AVAIL_DOWN, currTimestamp);
        return new ResourceDataPoint(availabilityDataPoint.getResource(), downPoint);
    }

    /**
     * Check for availability status from VC information, if exists.
     * VC information exists if this platform is monitored by a VM agent, and there also exists a VCenter agent
     * that monitors this platform.
     * The platform must have exactly one virtual parent measurement, and the latest cached value of that
     * measurement must be UP or DOWN; in that case the parent's metric value is copied onto the platform's
     * own measurement.
     * @param availabilityDataPoint - latest availability status
     * @return new availability status to update, if exists. Null otherwise.
     */
    private ResourceDataPoint getPlatformStatusFromVC(ResourceDataPoint availabilityDataPoint) {
        final Integer platformId = availabilityDataPoint.getResource().getId();
        if (log.isDebugEnabled()) {
            log.debug("getPlatformStatusFromVC: platformId" + platformId);
        }
        final List<Integer> resourceIds = new ArrayList<Integer>();
        resourceIds.add(platformId);
        final Map<Integer, List<Measurement>> virtualParent = availabilityManager.getAvailMeasurementDirectParent(
            resourceIds, AuthzConstants.ResourceEdgeVirtualRelation);
        if (isEmptyMap(virtualParent)) {
            return null;
        }
        // else - there should be a single measurement of the related VM Instance:
        final List<Measurement> resourceEdgeVirtualRelations = virtualParent.get(platformId);
        // Short-circuit OR: isEmpty() must not be evaluated when the platform has no entry in the map.
        if ((resourceEdgeVirtualRelations == null) || resourceEdgeVirtualRelations.isEmpty()) {
            return null;
        }
        if (resourceEdgeVirtualRelations.size() != 1) {
            log.warn("getPlatfromStatusFromVC: Platform " + platformId + " got " + resourceEdgeVirtualRelations.size() + " virtual parents. Ignoring.");
            return null;
        }
        // we now have the VM Instance Measurement ID. We will copy its latest availability status
        final Measurement vmParentMeasurement = resourceEdgeVirtualRelations.get(0);
        final long endTimeStamp = getEndWindow(getCurTimestamp(), vmParentMeasurement);
        final DataPoint defaultParentDataPoint = new DataPoint(vmParentMeasurement.getId().intValue(),
                                                               MeasurementConstants.AVAIL_NULL, endTimeStamp);
        final DataPoint lastParentDataPoint;
        synchronized (availabilityCache) {
            lastParentDataPoint = availabilityCache.get(vmParentMeasurement.getId(), defaultParentDataPoint);
        }
        if (lastParentDataPoint == null) {
            return null;
        }
        final double parentStatus = lastParentDataPoint.getValue();
        if ((parentStatus == MeasurementConstants.AVAIL_UP) || (parentStatus == MeasurementConstants.AVAIL_DOWN)) {
            final DataPoint newDataPoint = new DataPoint(availabilityDataPoint.getMeasurementId(),
                                                         lastParentDataPoint.getMetricValue());
            final ResourceDataPoint resPoint = new ResourceDataPoint(availabilityDataPoint.getResource(), newDataPoint);
            if (log.isDebugEnabled()) {
                log.debug("getPlatformStatusFromVC: found parent measurement: " + lastParentDataPoint.getMeasurementId() +
                          "; adding point: " + resPoint.toString());
            }
            return resPoint;
        }
        // Any other parent status (e.g. the AVAIL_NULL default) gives no usable information.
        return null;
    }

    /**
     * @param rHierarchy - map to test, may be null
     * @return true if the map is null or holds no entries.
     */
    private boolean isEmptyMap(Map<Integer, List<Measurement>> rHierarchy) {
        return (rHierarchy == null) || rHierarchy.isEmpty();
    }

    /**
     * Fetch the availability measurements of the resources contained in the given platforms.
     * @param checkedPlatforms - platforms to look up
     * @return the result of getAvailMeasurementChildren() for the platforms' resource ids; it is read by
     * platform resource id.
     */
    private Map<Integer, List<Measurement>> getMeasurementHierarchy(Collection<ResourceDataPoint> checkedPlatforms) {
        final List<Integer> resourceIds = new ArrayList<Integer>();
        for (ResourceDataPoint rDataPoint : checkedPlatforms) {
            resourceIds.add(rDataPoint.getResource().getId());
        }
        return availabilityManager.getAvailMeasurementChildren(resourceIds, AuthzConstants.ResourceEdgeContainmentRelation);
    }

    /**
     * Given a list of platforms' data points, return a list holding those data points together with data
     * points for the platforms' servers and services, with their appropriate status:
     * UNKNOWN if the platform is UP, DOWN otherwise; an HQ Agent is always DOWN.
     * Disabled measurements, and measurements that were updated during the last interval, are skipped.
     * This is the single implementation used by both the chunked and the non-chunked check.
     * @param checkedPlatforms - new calculated availability status of platforms.
     * @return the platforms' data points, each followed by the statuses of its servers and services.
     */
    private List<DataPoint> addStatusOfPlatformsDescendants(Collection<ResourceDataPoint> checkedPlatforms) {
        final boolean debug = log.isDebugEnabled();
        if (debug) {
            log.debug("addStatusOfPlatformsDescendants: start");
        }
        final List<DataPoint> res = new ArrayList<DataPoint>();
        final Map<Integer, List<Measurement>> rHierarchy = getMeasurementHierarchy(checkedPlatforms);
        final long currTimestamp = getCurTimestamp();
        for (ResourceDataPoint rdp : checkedPlatforms) {
            final Resource platform = rdp.getResource();
            res.add(rdp);
            final List<Measurement> associatedMeasurements = rHierarchy.get(platform.getId());
            if (associatedMeasurements == null) {
                continue;
            }
            final boolean platformUp = rdp.getMetricValue().getValue() == MeasurementConstants.AVAIL_UP;
            final double assocStatus = platformUp ? MeasurementConstants.AVAIL_UNKNOWN
                                                  : MeasurementConstants.AVAIL_DOWN;
            for (Measurement meas : associatedMeasurements) {
                if (!meas.isEnabled()) {
                    continue;
                }
                // The agent itself is reported DOWN regardless of the platform status.
                final double curStatus = isHQAgent(meas) ? MeasurementConstants.AVAIL_DOWN : assocStatus;
                final long backfillTime = getBackfillTime(currTimestamp, meas);
                if (backfillTime > currTimestamp) {
                    // the resource was updated during the last interval. we do not want to update it.
                    // TODO: Shouldn't platform be marked as UP?
                    continue;
                }
                final MetricValue val = new MetricValue(curStatus, backfillTime);
                res.add(new MeasDataPoint(meas.getId(), val, true));
            }
        }
        if (debug) {
            log.debug("addStatusOfPlatformsDescendants: end, res size: " + res.size());
        }
        return res;
    }

    /**
     * Get the time of the first interval that was not updated: the timestamp of the latest cached data point
     * of the measurement (or of a default AVAIL_NULL point placed at the end window) plus one interval.
     * @param current - current time stamp
     * @param meas - measurement to check for
     * @return the time of the first interval that was not updated.
     */
    private long getBackfillTime(long current, Measurement meas) {
        final long end = getEndWindow(current, meas);
        final DataPoint defaultPt = new DataPoint(meas.getId().intValue(), MeasurementConstants.AVAIL_NULL, end);
        final DataPoint lastPt = availabilityCache.get(meas.getId(), defaultPt);
        return lastPt.getTimestamp() + meas.getInterval();
    }

    /**
     * Compute the end of the window for a measurement: (current - interval) rounded down by
     * TimingVoodoo.roundDownTime() using the measurement interval.
     * End is at least more than 1 interval away.
     * @param current - current time stamp
     * @param meas - measurement whose interval is used
     * @return the rounded-down end window time.
     */
    private long getEndWindow(long current, Measurement meas) {
        return TimingVoodoo.roundDownTime((current - meas.getInterval()), meas.getInterval());
    }

    /**
     * If curTimeStamp is 0, return the real current time.
     * Otherwise - return curTimeStamp set by the calling method.
     * @return time
     */
    private long getCurTimestamp() {
        if (curTimeStamp != 0) {
            return curTimeStamp;
        }
        return System.currentTimeMillis();
    }
}