/*
* Copyright (c) 2010-2012 Grid Dynamics Consulting Services, Inc, All Rights Reserved
* http://www.griddynamics.com
*
* This library is free software; you can redistribute it and/or modify it under the terms of
* the Apache License; either
* version 2.0 of the License, or any later version.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package com.griddynamics.jagger.monitoring;
import com.google.common.base.Throwables;
import com.griddynamics.jagger.agent.model.GetCollectedProfileFromSuT;
import com.griddynamics.jagger.agent.model.GetSystemInfo;
import com.griddynamics.jagger.agent.model.ManageCollectionProfileFromSuT;
import com.griddynamics.jagger.agent.model.SystemInfo;
import com.griddynamics.jagger.coordinator.*;
import com.griddynamics.jagger.diagnostics.thread.sampling.ProfileDTO;
import com.griddynamics.jagger.exception.TechnicalException;
import com.griddynamics.jagger.storage.fs.logging.LogProcessor;
import com.griddynamics.jagger.storage.fs.logging.LogWriter;
import com.griddynamics.jagger.util.SerializationUtils;
import com.griddynamics.jagger.util.TimeUtils;
import com.griddynamics.jagger.util.Timeout;
import org.hibernate.SessionFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeoutException;
import static com.griddynamics.jagger.agent.model.ManageCollectionProfileFromSuT.ManageHotSpotMethodsFromSuT;
/**
 * Performs monitoring of one agent.
 * <p>
 * {@link #start()} submits a background task to the supplied executor. The task asks the agent
 * to start profiler polling, then repeatedly requests {@code GetSystemInfo} from the agent and
 * hands every returned {@code SystemInfo} to the {@code MonitoringProcessor}. When {@link #stop()}
 * is requested the task flushes the log writer, asks the agent to stop profiler polling, fetches
 * the collected profile and writes it through the log writer.
 * <p>
 * {@link #stop()} blocks until the background task has finished. The completion latch is created
 * in {@link #start()} (not in {@link #stop()}), so a task that terminates on its own can never be
 * missed by a concurrent {@link #stop()} call.
 *
 * @author Mairbek Khadikov
 */
public class MonitorProcess extends LogProcessor implements NodeProcess<MonitoringStatus> {
    /** Suffix appended to the task id when the collected profile is written to the log writer. */
    public static final String PROFILER_MARKER = "PROFILER";

    private static final Logger log = LoggerFactory.getLogger(MonitorProcess.class);

    private final String sessionId;
    private final NodeId agentId;
    private final NodeContext nodeContext;
    private final Coordinator coordinator;
    private final ExecutorService executor;
    private final long pollingInterval;
    private final long profilerPollingInterval;
    private final MonitoringProcessor monitoringProcessor;
    private final String taskId;
    private final LogWriter logWriter;
    private final Timeout ttl;

    /** {@code true} while the background task is expected to keep polling. */
    private volatile boolean alive;

    /** Released by the background task when it terminates; created by {@link #start()}. */
    private volatile CountDownLatch latch;

    /*package*/ MonitorProcess(String sessionId, NodeId agentId, NodeContext nodeContext, Coordinator coordinator,
                               ExecutorService executor, long pollingInterval, long profilerPollingInterval,
                               MonitoringProcessor monitoringProcessor, String taskId, LogWriter logWriter,
                               SessionFactory sessionFactory, Timeout ttl) {
        this.sessionId = sessionId;
        this.agentId = agentId;
        this.nodeContext = nodeContext;
        this.coordinator = coordinator;
        this.executor = executor;
        this.pollingInterval = pollingInterval;
        this.monitoringProcessor = monitoringProcessor;
        this.taskId = taskId;
        this.logWriter = logWriter;
        this.profilerPollingInterval = profilerPollingInterval;
        this.setSessionFactory(sessionFactory);
        this.ttl = ttl;
    }

    /**
     * Starts monitoring of the agent in a background task and returns immediately.
     *
     * @throws TechnicalException declared in the signature; this implementation does not throw it itself
     */
    @Override
    public void start() throws TechnicalException {
        log.info("Kernel {} has started monitoring on agent {} by task id {}", new Object[]{nodeContext.getId(), agentId, taskId});

        // The latch must exist before the task can possibly finish, otherwise stop() could
        // start waiting on a latch that nobody will ever release.
        final CountDownLatch done = new CountDownLatch(1);
        latch = done;
        alive = true;

        final RemoteExecutor remote = coordinator.getExecutor(agentId);
        Runnable runnable = new Runnable() {
            @Override
            public void run() {
                monitor(remote, done);
            }
        };

        try {
            executor.execute(runnable);
        } catch (RuntimeException e) {
            // The task will never run (e.g. the executor rejected it): make sure a later
            // stop() does not wait for it.
            alive = false;
            done.countDown();
            throw e;
        }
    }

    /**
     * Always reports {@link MonitoringStatus#PROGRESS}.
     *
     * @return {@link MonitoringStatus#PROGRESS}
     */
    @Override
    public MonitoringStatus getStatus() {
        return MonitoringStatus.PROGRESS;
    }

    /**
     * Requests the background task to stop and waits until it has terminated.
     * If monitoring is not running, only a warning is logged.
     * If the waiting thread is interrupted, its interrupt flag is restored and the method returns.
     */
    @Override
    public void stop() {
        log.info("Stop of monitoring requested. agent {}", agentId);
        if (alive) {
            // 'alive' is only set to true after 'latch' has been assigned in start().
            final CountDownLatch done = latch;
            alive = false;
            try {
                if (done != null) {
                    done.await();
                }
            } catch (InterruptedException e) {
                // Preserve the interruption for the caller instead of swallowing it.
                Thread.currentThread().interrupt();
                log.warn("Interrupted while waiting for monitoring on agent " + agentId + " to stop", e);
            }
            log.info("Kernel {} has stopped monitoring on agent {}", nodeContext.getId(), agentId);
        } else {
            log.warn("Monitoring on agent {} is not running. Skipping StopMonitoring", agentId);
        }
    }

    public Timeout getTtl() {
        return ttl;
    }

    /**
     * Body of the background task: starts profiler polling, polls system info until
     * {@link #stop()} is requested, then flushes and collects the profile.
     * Always clears {@code alive} and releases {@code done} when it exits.
     */
    private void monitor(RemoteExecutor remote, CountDownLatch done) {
        try {
            VoidResult startResult = remote.runSyncWithTimeout(new ManageCollectionProfileFromSuT(sessionId,
                    ManageHotSpotMethodsFromSuT.START_POLLING, profilerPollingInterval), Coordination.<ManageCollectionProfileFromSuT>doNothing(), ttl);
            final boolean profilingStarted = !startResult.hasException();
            if (!profilingStarted) {
                // Reported once here rather than on every polling iteration.
                log.error("Remote exception raised during staring profiling from SuT", startResult.getException());
            }

            while (alive) {
                pollSystemInfo(remote);
                TimeUtils.sleepMillis(pollingInterval);
            }

            log.debug("try to flush monitoring on kernel {}", nodeContext.getId());
            logWriter.flush();
            log.debug("monitoring flushed on kernel {}", nodeContext.getId());

            if (profilingStarted) {
                stopProfilingAndCollect(remote);
            } else {
                log.warn("Collection profiling from SuT didn't start");
            }
        } catch (Throwable e) {
            log.error("Start polling failed for agent " + agentId + "\n" + Throwables.getStackTraceAsString(e));
        } finally {
            log.debug("releasing a latch");
            alive = false;
            done.countDown();
            log.debug("latch released");
        }
    }

    /**
     * Requests system info from the agent once and passes every entry to the monitoring processor.
     * Any failure is logged and otherwise ignored so that polling continues.
     */
    private void pollSystemInfo(RemoteExecutor remote) {
        long startTime = System.currentTimeMillis();
        log.debug("try getting GetSystemInfo on kernel {} from {}", nodeContext.getId(), agentId);
        try {
            ArrayList<SystemInfo> info = remote.runSyncWithTimeout(new GetSystemInfo(sessionId), Coordination.<GetSystemInfo>doNothing(), ttl);
            log.debug("GetSystemInfo got on kernel {} from {} time {} ms",
                    new Object[]{nodeContext.getId(), agentId, System.currentTimeMillis() - startTime});
            for (SystemInfo systemInfo : info) {
                monitoringProcessor.process(sessionId, taskId, agentId, nodeContext, systemInfo);
            }
            log.debug("monitoring logged to file storage on kernel {}", nodeContext.getId());
        } catch (Throwable e) {
            log.error("Ignore GetSystemInfo from agent " + agentId + " due to error", e);
        }
    }

    /**
     * Asks the agent to stop profiler polling and then collects the profile.
     * Failures are logged and not propagated.
     */
    private void stopProfilingAndCollect(RemoteExecutor remote) {
        log.debug("try to manage monitoring on agent {} from kernel {}", agentId, nodeContext.getId());
        try {
            VoidResult stopResult = remote.runSyncWithTimeout(new ManageCollectionProfileFromSuT(sessionId, ManageHotSpotMethodsFromSuT.STOP_POLLING,
                    profilerPollingInterval), Coordination.<ManageCollectionProfileFromSuT>doNothing(), ttl);
            log.debug("manage monitoring has done on agent {} from kernel {}", agentId, nodeContext.getId());
            if (stopResult.hasException()) {
                log.error("Remote exception raised during stopping profiling from SuT", stopResult.getException());
            }
            collectProfile(remote);
        } catch (Throwable e) {
            log.error("Stop polling failed for agent " + agentId + "\n" + Throwables.getStackTraceAsString(e));
        }
    }

    /**
     * Fetches the collected profile from the agent and, unless it has no runtime graphs,
     * writes it through the log writer and flushes. Failures are logged and not propagated.
     */
    private void collectProfile(RemoteExecutor remote) {
        log.debug("try to get collected profiler from agent {} from kernel {}", agentId, nodeContext.getId());
        try {
            final ProfileDTO profileDTO =
                    remote.runSyncWithTimeout(GetCollectedProfileFromSuT.create(sessionId), Coordination.<GetCollectedProfileFromSuT>doNothing(), ttl);
            if (profileDTO.getRuntimeGraphs().isEmpty()) {
                log.info("Profiler of {} turned off. There is no profiler data for recording", agentId);
            } else {
                log.debug("got collected profiler from agent {} from kernel {}", agentId, nodeContext.getId());
                logWriter.log(sessionId, taskId + "/" + PROFILER_MARKER, agentId.getIdentifier(), SerializationUtils.toString(profileDTO));
                log.debug("Profiler {} received from agent {} and has been written to FileStorage", profileDTO, agentId);
                logWriter.flush();
                log.debug("Flushing performed on kernel {}", nodeContext.getId());
            }
        } catch (Throwable e) {
            log.error("Get collected profile failed for agent " + agentId + "\n" + Throwables.getStackTraceAsString(e));
        }
    }
}