// Copyright 2012 Citrix Systems, Inc. Licensed under the // Apache License, Version 2.0 (the "License"); you may not use this // file except in compliance with the License. Citrix Systems, Inc. // reserves all rights not expressly granted by the License. // You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // // Automatically generated by addcopyright.py at 04/03/2012 package com.cloud.agent.manager; import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; import org.apache.log4j.Logger; import com.cloud.agent.Listener; import com.cloud.agent.api.AgentControlAnswer; import com.cloud.agent.api.AgentControlCommand; import com.cloud.agent.api.Answer; import com.cloud.agent.api.Command; import com.cloud.agent.api.PingCommand; import com.cloud.agent.api.StartupCommand; import com.cloud.alert.AlertManager; import com.cloud.dc.DataCenterVO; import com.cloud.dc.HostPodVO; import com.cloud.dc.dao.ClusterDao; import com.cloud.dc.dao.DataCenterDao; import com.cloud.dc.dao.HostPodDao; import com.cloud.host.Host; import com.cloud.host.HostVO; import com.cloud.host.Status; import com.cloud.host.Status.Event; import com.cloud.host.dao.HostDao; import com.cloud.resource.ResourceManager; import com.cloud.resource.ResourceState; import com.cloud.utils.component.Inject; import com.cloud.utils.db.ConnectionConcierge; import com.cloud.utils.db.DB; import com.cloud.utils.db.SearchCriteria2; import com.cloud.utils.db.SearchCriteria.Op; import com.cloud.utils.db.SearchCriteriaService; import com.cloud.utils.time.InaccurateClock; import com.cloud.vm.VMInstanceVO; import com.cloud.vm.dao.VMInstanceDao; public class AgentMonitor extends Thread implements Listener { private static Logger s_logger = Logger.getLogger(AgentMonitor.class); private static Logger status_Logger = Logger.getLogger(Status.class); private long _pingTimeout; private HostDao _hostDao; private boolean _stop; private AgentManagerImpl _agentMgr; private VMInstanceDao _vmDao; private DataCenterDao _dcDao = null; private HostPodDao _podDao = null; private AlertManager _alertMgr; private long _msId; private ConnectionConcierge _concierge; @Inject ClusterDao _clusterDao; @Inject ResourceManager _resourceMgr; // private ConnectionConcierge _concierge; private Map<Long, Long> _pingMap; protected AgentMonitor() { } public AgentMonitor(long msId, HostDao hostDao, VMInstanceDao vmDao, DataCenterDao dcDao, HostPodDao podDao, AgentManagerImpl agentMgr, AlertManager alertMgr, long pingTimeout) { super("AgentMonitor"); _msId = msId; _pingTimeout = pingTimeout; _hostDao = hostDao; _agentMgr = agentMgr; _stop = false; _vmDao = vmDao; _dcDao = dcDao; _podDao = podDao; _alertMgr = alertMgr; _pingMap = new ConcurrentHashMap<Long, Long>(10007); // try { // Connection conn = Transaction.getStandaloneConnectionWithException(); // conn.setAutoCommit(true); // conn.setTransactionIsolation(Connection.TRANSACTION_READ_COMMITTED); // _concierge = new ConnectionConcierge("AgentMonitor", conn, true); // } catch (SQLException e) { // throw new CloudRuntimeException("Unable to get a db connection", e); // } } /** * Check if the agent is behind on ping * * @param agentId * agent or host id. * @return null if the agent is not kept here. true if behind; false if not. */ public Boolean isAgentBehindOnPing(long agentId) { Long pingTime = _pingMap.get(agentId); if (pingTime == null) { return null; } return pingTime < (InaccurateClock.getTimeInSeconds() - _pingTimeout); } public Long getAgentPingTime(long agentId) { return _pingMap.get(agentId); } public void pingBy(long agentId) { _pingMap.put(agentId, InaccurateClock.getTimeInSeconds()); } // TODO : use host machine time is not safe in clustering environment @Override public void run() { s_logger.info("Agent Monitor is started."); while (!_stop) { try { // check every 60 seconds Thread.sleep(60 * 1000); } catch (InterruptedException e) { s_logger.info("Who woke me from my slumber?"); } try { List<Long> behindAgents = findAgentsBehindOnPing(); for (Long agentId : behindAgents) { SearchCriteriaService<HostVO, HostVO> sc = SearchCriteria2.create(HostVO.class); sc.addAnd(sc.getEntity().getId(), Op.EQ, agentId); HostVO h = sc.find(); ResourceState resourceState = h.getResourceState(); if (resourceState == ResourceState.Disabled || resourceState == ResourceState.Maintenance || resourceState == ResourceState.ErrorInMaintenance) { /* Host is in non-operation state, so no investigation and direct put agent to Disconnected */ status_Logger.debug("Ping timeout but host " + agentId + " is in resource state of " + resourceState + ", so no investigation"); _agentMgr.disconnectWithoutInvestigation(agentId, Event.ShutdownRequested); } else { status_Logger.debug("Ping timeout for host " + agentId + ", do invstigation"); _agentMgr.disconnectWithInvestigation(agentId, Event.PingTimeout); } } SearchCriteriaService<HostVO, HostVO> sc = SearchCriteria2.create(HostVO.class); sc.addAnd(sc.getEntity().getResourceState(), Op.IN, ResourceState.PrepareForMaintenance, ResourceState.ErrorInMaintenance); List<HostVO> hosts = sc.list(); for (HostVO host : hosts) { long hostId = host.getId(); DataCenterVO dcVO = _dcDao.findById(host.getDataCenterId()); HostPodVO podVO = _podDao.findById(host.getPodId()); String hostDesc = "name: " + host.getName() + " (id:" + hostId + "), availability zone: " + dcVO.getName() + ", pod: " + podVO.getName(); if (host.getType() != Host.Type.Storage) { List<VMInstanceVO> vos = _vmDao.listByHostId(hostId); List<VMInstanceVO> vosMigrating = _vmDao.listVmsMigratingFromHost(hostId); if (vos.isEmpty() && vosMigrating.isEmpty()) { _alertMgr.sendAlert(AlertManager.ALERT_TYPE_HOST, host.getDataCenterId(), host.getPodId(), "Migration Complete for host " + hostDesc, "Host [" + hostDesc + "] is ready for maintenance"); _resourceMgr.resourceStateTransitTo(host, ResourceState.Event.InternalEnterMaintenance, _msId); } } } } catch (Throwable th) { s_logger.error("Caught the following exception: ", th); } } s_logger.info("Agent Monitor is leaving the building!"); } public void signalStop() { _stop = true; interrupt(); } @Override public boolean isRecurring() { return true; } @Override public boolean processAnswers(long agentId, long seq, Answer[] answers) { return false; } @Override @DB public boolean processCommands(long agentId, long seq, Command[] commands) { boolean processed = false; for (Command cmd : commands) { if (cmd instanceof PingCommand) { pingBy(agentId); } } return processed; } protected List<Long> findAgentsBehindOnPing() { List<Long> agentsBehind = new ArrayList<Long>(); long cutoffTime = InaccurateClock.getTimeInSeconds() - _pingTimeout; for (Map.Entry<Long, Long> entry : _pingMap.entrySet()) { if (entry.getValue() < cutoffTime) { agentsBehind.add(entry.getKey()); } } if (agentsBehind.size() > 0) { s_logger.info("Found the following agents behind on ping: " + agentsBehind); } return agentsBehind; } /** * @deprecated We're using the in-memory */ @Deprecated protected List<HostVO> findHostsBehindOnPing() { long time = (System.currentTimeMillis() >> 10) - _pingTimeout; List<HostVO> hosts = _hostDao.findLostHosts(time); if (s_logger.isInfoEnabled()) { s_logger.info("Found " + hosts.size() + " hosts behind on ping. pingTimeout : " + _pingTimeout + ", mark time : " + time); } for (HostVO host : hosts) { if (host.getType().equals(Host.Type.ExternalFirewall) || host.getType().equals(Host.Type.ExternalLoadBalancer) || host.getType().equals(Host.Type.TrafficMonitor) || host.getType().equals(Host.Type.SecondaryStorage)) { continue; } if (host.getManagementServerId() == null || host.getManagementServerId() == _msId) { if (s_logger.isInfoEnabled()) { s_logger.info("Asking agent mgr to investgate why host " + host.getId() + " is behind on ping. last ping time: " + host.getLastPinged()); } _agentMgr.disconnectWithInvestigation(host.getId(), Event.PingTimeout); } } return hosts; } @Override public AgentControlAnswer processControlCommand(long agentId, AgentControlCommand cmd) { return null; } @Override public void processConnect(HostVO host, StartupCommand cmd, boolean forRebalance) { if (host.getType().equals(Host.Type.TrafficMonitor) || host.getType().equals(Host.Type.SecondaryStorage)) { return; } // NOTE: We don't use pingBy here because we're initiating. _pingMap.put(host.getId(), InaccurateClock.getTimeInSeconds()); } @Override public boolean processDisconnect(long agentId, Status state) { _pingMap.remove(agentId); return true; } @Override public boolean processTimeout(long agentId, long seq) { return true; } @Override public int getTimeout() { return -1; } }