package org.archive.crawler.monitor;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.archive.crawler.event.StatSnapshotEvent;
import org.archive.crawler.framework.CrawlController;
import org.archive.spring.ConfigPath;
import org.archive.spring.ConfigPathConfigurer;
import org.archive.util.ArchiveUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.ApplicationEvent;
import org.springframework.context.ApplicationListener;
/**
* Monitors the available space on the paths configured. If the available space
* drops below a specified threshold a crawl pause is requested.
* <p>
* Monitoring is done via the <code>java.io.File.getUsableSpace()</code> method.
* This method will sometimes fail on network attached storage, returning 0
* bytes available even if that is not actually the case.
* <p>
* Paths that do not resolve to actual filesystem folders or files will not be
* evaluated (i.e. if <code>java.io.File.exists()</code> returns <code>false</code>
* no further processing is carried out on that File).
* <p>
* Paths are checked available space whenever a {@link StatSnapshotEvent} occurs.
*
* @contributor Kristinn Sigurðsson
*/
public class DiskSpaceMonitor implements ApplicationListener<ApplicationEvent> {
private static final Logger logger = Logger.getLogger(DiskSpaceMonitor.class.getName());
protected List<String> monitorPaths = new ArrayList<String>();
protected long pauseThresholdMiB = 500;
protected CrawlController controller;
protected ConfigPathConfigurer configPathConfigurer;
protected boolean monitorConfigPaths = true;
/**
* @param monitorPaths List of filesystem paths that should be monitored for available space.
*/
public void setMonitorPaths(List<String> monitorPaths) {
this.monitorPaths = monitorPaths;
}
public List<String> getMonitorPaths() {
return this.monitorPaths;
}
/**
* Set the minimum amount of space that must be available on all monitored paths.
* If the amount falls below this pause threshold on any path the crawl will be paused.
*
* @param pauseThresholdMiB The desired pause threshold value.
* Specified in megabytes (MiB).
*/
public void setPauseThresholdMiB(long pauseThresholdMiB) {
this.pauseThresholdMiB = pauseThresholdMiB;
}
public long getPauseThresholdMiB() {
return this.pauseThresholdMiB;
}
/**
* If enabled, all the paths returned by {@link ConfigPathConfigurer#getAllConfigPaths()}
* will be monitored in addition to any paths explicitly specified via
* {@link #setMonitorPaths(List)}.
* <p>
* <code>true</code> by default.
* <p>
* <em>Note:</em> This is not guaranteed to contain all paths that Heritrix writes to.
* It is the responsibility of modules that write to disk to register their activity
* with the {@link ConfigPathConfigurer} and some may not do so.
*
* @param monitorConfigPaths If config paths should be monitored for usable space.
*/
public void setMonitorConfigPaths(boolean monitorConfigPaths){
this.monitorConfigPaths = monitorConfigPaths;
}
public boolean getMonitorConfigPaths(){
return this.monitorConfigPaths;
}
/** Autowire access to CrawlController **/
@Autowired
public void setCrawlController(CrawlController controller) {
this.controller = controller;
}
public CrawlController getCrawlController() {
return this.controller;
}
/** Autowire access to ConfigPathConfigurer **/
@Autowired
public void setConfigPathConfigurer(ConfigPathConfigurer configPathConfigurer) {
this.configPathConfigurer = configPathConfigurer;
}
public ConfigPathConfigurer getConfigPathConfigurer() {
return this.configPathConfigurer;
}
/**
* Checks available space on {@link StatSnapshotEvent}s.
*/
@Override
public void onApplicationEvent(ApplicationEvent event) {
if (event instanceof StatSnapshotEvent) {
// Check available space every time the statistics tracker
// updates its sample, by default every 20 sec.
for (String path : getMonitorPaths()) {
checkAvailableSpace(new File(path));
}
if (monitorConfigPaths) {
for(ConfigPath path : configPathConfigurer.getAllConfigPaths().values()) {
checkAvailableSpace(path.getFile());
}
}
}
}
/**
* Probe via File.getUsableSpace to see if monitored paths have fallen below
* the pause threshold. If so, request a crawl pause.
*
* @path The filesystem path to check for usable space
*/
protected void checkAvailableSpace(File path) {
if (!path.exists()) {
// Paths that can not be resolved will not report accurate
// available space. Log and ignore.
logger.fine("Ignoring non-existent path " + path.getAbsolutePath());
return;
}
long availBytes = path.getUsableSpace();
long thresholdBytes = getPauseThresholdMiB() * 1024 * 1024;
if (availBytes < thresholdBytes && controller.isActive()) {
// Enact pause
controller.requestCrawlPause();
// Log issue
String errorMsg = "Low Disk Pause - %d bytes (%s) available on %s, "
+ "this is below the minimum threshold of %d bytes (%s)";
logger.log(Level.SEVERE, String.format(errorMsg, availBytes,
ArchiveUtils.formatBytesForDisplay(availBytes),
path.getAbsolutePath(), thresholdBytes,
ArchiveUtils.formatBytesForDisplay(thresholdBytes)));
}
}
}