/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.crawler.framework;
import org.archive.crawler.event.StatSnapshotEvent;
import org.archive.crawler.reporting.CrawlStatSnapshot;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.ApplicationEvent;
import org.springframework.context.ApplicationListener;
/**
 * Bean to enforce limits on the size of a crawl in URI count,
 * byte count, or elapsed time. Listens for StatSnapshotEvents,
 * so it only checks limits at the interval (configured in
 * StatisticsTracker) at which those events are published.
 *
 * @contributor gojomo
 */
public class CrawlLimitEnforcer implements ApplicationListener<ApplicationEvent> {

    /**
     * Maximum number of bytes to download. Once this number is exceeded
     * the crawler will stop. A value of zero means no upper limit.
     */
    protected long maxBytesDownload = 0L;
    public long getMaxBytesDownload() {
        return maxBytesDownload;
    }
    public void setMaxBytesDownload(long maxBytesDownload) {
        this.maxBytesDownload = maxBytesDownload;
    }

    /**
     * Maximum number of novel (not deduplicated) bytes to download. Once
     * this number is exceeded the crawler will stop. A value of zero means
     * no upper limit.
     */
    protected long maxNovelBytes = 0L;
    public long getMaxNovelBytes() {
        return maxNovelBytes;
    }
    public void setMaxNovelBytes(long maxNovelBytes) {
        this.maxNovelBytes = maxNovelBytes;
    }

    /**
     * Maximum number of novel (not deduplicated) urls to download. Once this
     * number is exceeded the crawler will stop. A value of zero means no upper
     * limit.
     */
    protected long maxNovelUrls = 0L;
    public long getMaxNovelUrls() {
        return maxNovelUrls;
    }
    public void setMaxNovelUrls(long maxNovelUrls) {
        this.maxNovelUrls = maxNovelUrls;
    }

    /**
     * Maximum number of urls to write to WARC response or resource records.
     * Once this number is exceeded the crawler will stop. A value of zero means
     * no upper limit.
     */
    protected long maxWarcNovelUrls = 0L;
    public long getMaxWarcNovelUrls() {
        return maxWarcNovelUrls;
    }
    public void setMaxWarcNovelUrls(long maxWarcNovelUrls) {
        this.maxWarcNovelUrls = maxWarcNovelUrls;
    }

    /**
     * Maximum number of novel (not deduplicated) bytes to write to WARC
     * response or resource records. Once this number is exceeded the crawler
     * will stop. A value of zero means no upper limit.
     */
    protected long maxWarcNovelBytes = 0L;
    public long getMaxWarcNovelBytes() {
        return maxWarcNovelBytes;
    }
    public void setMaxWarcNovelBytes(long maxWarcNovelBytes) {
        this.maxWarcNovelBytes = maxWarcNovelBytes;
    }

    /**
     * Maximum number of documents to download. Once this number is exceeded the
     * crawler will stop. A value of zero means no upper limit.
     */
    protected long maxDocumentsDownload = 0L;
    public long getMaxDocumentsDownload() {
        return maxDocumentsDownload;
    }
    public void setMaxDocumentsDownload(long maxDocumentsDownload) {
        this.maxDocumentsDownload = maxDocumentsDownload;
    }

    /**
     * Maximum amount of time to crawl (in seconds). Once this much time has
     * elapsed the crawler will stop. A value of zero means no upper limit.
     */
    protected long maxTimeSeconds = 0L;
    public long getMaxTimeSeconds() {
        return maxTimeSeconds;
    }
    public void setMaxTimeSeconds(long maxTimeSeconds) {
        this.maxTimeSeconds = maxTimeSeconds;
    }

    /** Controller to which a crawl-stop request is issued when a limit trips. */
    protected CrawlController controller;
    public CrawlController getCrawlController() {
        return this.controller;
    }
    @Autowired
    public void setCrawlController(CrawlController controller) {
        this.controller = controller;
    }

    /**
     * Reacts to StatSnapshotEvents by checking the snapshot against all
     * configured limits; all other application events are ignored.
     *
     * @param event any application event; only StatSnapshotEvent is acted upon
     */
    @Override
    public void onApplicationEvent(ApplicationEvent event) {
        if (event instanceof StatSnapshotEvent) {
            CrawlStatSnapshot snapshot = ((StatSnapshotEvent)event).getSnapshot();
            checkForLimitsExceeded(snapshot);
        }
    }

    /**
     * Requests a crawl stop with the appropriate terminal status if any
     * configured (nonzero) limit has been reached or exceeded. At most one
     * stop request is issued per snapshot; byte limits are checked before
     * document limits, which are checked before the time limit.
     *
     * @param snapshot statistics snapshot to evaluate against the limits
     */
    protected void checkForLimitsExceeded(CrawlStatSnapshot snapshot) {
        if (maxBytesDownload > 0 && snapshot.bytesProcessed >= maxBytesDownload) {
            controller.requestCrawlStop(CrawlStatus.FINISHED_DATA_LIMIT);
        } else if (maxNovelBytes > 0 && snapshot.novelBytes >= maxNovelBytes) {
            controller.requestCrawlStop(CrawlStatus.FINISHED_DATA_LIMIT);
        } else if (maxWarcNovelBytes > 0 && snapshot.warcNovelBytes >= maxWarcNovelBytes) {
            controller.requestCrawlStop(CrawlStatus.FINISHED_DATA_LIMIT);
        } else if (maxDocumentsDownload > 0
                && snapshot.downloadedUriCount >= maxDocumentsDownload) {
            controller.requestCrawlStop(CrawlStatus.FINISHED_DOCUMENT_LIMIT);
        } else if (maxNovelUrls > 0
                && snapshot.novelUriCount >= maxNovelUrls) {
            controller.requestCrawlStop(CrawlStatus.FINISHED_DOCUMENT_LIMIT);
        } else if (maxWarcNovelUrls > 0
                && snapshot.warcNovelUriCount >= maxWarcNovelUrls) {
            controller.requestCrawlStop(CrawlStatus.FINISHED_DOCUMENT_LIMIT);
        } else if (maxTimeSeconds > 0
                && snapshot.elapsedMilliseconds >= maxTimeSeconds * 1000) {
            controller.requestCrawlStop(CrawlStatus.FINISHED_TIME_LIMIT);
        }
    }
}