/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.crawler.prefetch;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.HashMap;
import java.util.Map;
import javax.management.openmbean.CompositeData;
import org.apache.commons.httpclient.URIException;
import org.archive.crawler.framework.CrawlerProcessorTestBase;
import org.archive.crawler.framework.Frontier;
import org.archive.crawler.framework.Frontier.FrontierGroup;
import org.archive.crawler.frontier.FrontierJournal;
import org.archive.modules.CoreAttributeConstants;
import org.archive.modules.CrawlURI;
import org.archive.modules.ProcessResult;
import org.archive.modules.deciderules.DecideRule;
import org.archive.modules.fetcher.DefaultServerCache;
import org.archive.modules.fetcher.FetchStats;
import org.archive.modules.fetcher.FetchStats.Stage;
import org.archive.modules.fetcher.FetchStatusCodes;
import org.archive.modules.net.CrawlHost;
import org.archive.modules.net.CrawlServer;
import org.archive.net.UURIFactory;
import org.archive.util.ObjectIdentityCache;
import org.archive.util.ObjectIdentityMemCache;
import junit.framework.Assert;
/**
* Unit test for {@link QuotaEnforcer}.
*
* @contributor pjack
* @contributor nlevitt
*/
public class QuotaEnforcerTest extends CrawlerProcessorTestBase {
/**
 * FetchStats subclass exposing setters for exactly the two substats the
 * quota tests need, so each test can trip one quota in isolation.
 */
static class MockFetchStats extends FetchStats {
    // uppercase L suffix: lowercase 'l' is easily misread as the digit 1
    private static final long serialVersionUID = 1L;

    /** Directly sets the novel-bytes substat consulted by the *-kb quotas. */
    public void setNovelBytes(long n) {
        put(NOVEL, n);
    }

    /** Directly sets the novel-url-count substat consulted by the *-urls quotas. */
    public void setNovelUrls(long n) {
        put(NOVELCOUNT, n);
    }
}
/**
 * Server cache that lets tests seed pre-built CrawlHost/CrawlServer
 * entries keyed by hostname, bypassing normal lazy creation.
 */
static class MockServerCache extends DefaultServerCache {
    // uppercase L suffix: lowercase 'l' is easily misread as the digit 1
    private static final long serialVersionUID = 1L;

    /** Installs {@code crawlHost} as the cached entry for {@code host}. */
    @SuppressWarnings("unchecked") // hosts is an ObjectIdentityMemCache in this test setup
    public void setHostFor(String host, CrawlHost crawlHost) {
        ((ObjectIdentityMemCache<CrawlHost>) hosts).getMap().put(host, crawlHost);
    }

    /** Installs {@code crawlServer} as the cached entry for {@code h}. */
    @SuppressWarnings("unchecked") // servers is an ObjectIdentityMemCache in this test setup
    public void setServerFor(String h, CrawlServer crawlServer) {
        ((ObjectIdentityMemCache<CrawlServer>) servers).getMap().put(h, crawlServer);
    }
}
/**
 * Common hook implemented by the mock host/server/group classes below so
 * testNovel() can install canned FetchStats on whichever entity type
 * (host, server, or frontier group) the current scenario targets.
 */
interface CanSetSubstats {
public void setSubstats(FetchStats stats);
}
/** CrawlHost whose accumulated fetch statistics can be replaced wholesale. */
static class MockCrawlHost extends CrawlHost implements CanSetSubstats {
    // uppercase L suffix: lowercase 'l' is easily misread as the digit 1
    private static final long serialVersionUID = 1L;

    public MockCrawlHost(String hostname) {
        super(hostname);
    }

    /** Overwrites this host's substats with the canned test values. */
    @Override
    public void setSubstats(FetchStats stats) {
        this.substats = stats;
    }
}
/** CrawlServer whose accumulated fetch statistics can be replaced wholesale. */
static class MockCrawlServer extends CrawlServer implements CanSetSubstats {
    // uppercase L suffix: lowercase 'l' is easily misread as the digit 1
    private static final long serialVersionUID = 1L;

    public MockCrawlServer(String h) {
        super(h);
    }

    /** Overwrites this server's substats with the canned test values. */
    @Override
    public void setSubstats(FetchStats stats) {
        this.substats = stats;
    }
}
/**
 * FrontierGroup stub: only getSubstats()/setSubstats() matter to the
 * quota tests; every other FrontierGroup method is a no-op.
 */
static class MockFrontierGroup implements FrontierGroup, CanSetSubstats {
    // uppercase L suffix: lowercase 'l' is easily misread as the digit 1
    private static final long serialVersionUID = 1L;

    protected FetchStats substats = new FetchStats();

    // the only method that's used in this mock class
    @Override
    public FetchStats getSubstats() {
        return substats;
    }

    @Override
    public void setSubstats(FetchStats stats) {
        this.substats = stats;
    }

    // remaining FrontierGroup methods are unused no-op stubs

    @Override
    public void tally(CrawlURI curi, Stage stage) {
    }

    @Override
    public void setIdentityCache(ObjectIdentityCache<?> cache) {
    }

    @Override
    public String getKey() {
        return null;
    }

    @Override
    public void makeDirty() {
    }
}
/**
 * Frontier stub: getGroup() lazily creates and caches one
 * MockFrontierGroup per hostname; every other Frontier method is an
 * unused no-op / zero-returning stub required by the interface.
 */
static class MockFrontier implements Frontier {
// one group per hostname, created on demand by getGroup()
protected Map<String,MockFrontierGroup> hostGroups = new HashMap<String,MockFrontierGroup>();
// the only method that's used in this mock class
@Override
public FrontierGroup getGroup(CrawlURI curi) {
String host;
try {
host = curi.getUURI().getHost();
MockFrontierGroup group = hostGroups.get(host);
if (group == null) {
group = new MockFrontierGroup();
hostGroups.put(host, group);
}
return group;
} catch (URIException e) {
// a bad test URI is a test bug, not a condition to handle
Assert.fail();
return null;
}
}
// everything below is an unused stub demanded by the Frontier interface
@Override
public void start() {
}
@Override
public void stop() {
}
@Override
public boolean isRunning() {
return false;
}
@Override
public void reportTo(PrintWriter writer) throws IOException {
}
@Override
public void shortReportLineTo(PrintWriter pw) throws IOException {
}
@Override
public Map<String, Object> shortReportMap() {
return null;
}
@Override
public String shortReportLegend() {
return null;
}
@Override
public CrawlURI next() throws InterruptedException {
return null;
}
@Override
public boolean isEmpty() {
return false;
}
@Override
public void schedule(CrawlURI caURI) {
}
@Override
public void finished(CrawlURI cURI) {
}
@Override
public long discoveredUriCount() {
return 0;
}
@Override
public long queuedUriCount() {
return 0;
}
@Override
public long futureUriCount() {
return 0;
}
@Override
public long deepestUri() {
return 0;
}
@Override
public long averageDepth() {
return 0;
}
@Override
public float congestionRatio() {
return 0;
}
@Override
public long finishedUriCount() {
return 0;
}
@Override
public long succeededFetchCount() {
return 0;
}
@Override
public long failedFetchCount() {
return 0;
}
@Override
public long disregardedUriCount() {
return 0;
}
@Override
public void importURIs(String params) throws IOException {
}
@Override
public long importRecoverFormat(File source, boolean applyScope, boolean includeOnly, boolean forceFetch,
String acceptTags) throws IOException {
return 0;
}
@Override
public CompositeData getURIsList(String marker, int numberOfMatches, String regex, boolean verbose) {
return null;
}
@Override
public long deleteURIs(String queueRegex, String match) {
return 0;
}
@Override
public void deleted(CrawlURI curi) {
}
@Override
public void considerIncluded(CrawlURI curi) {
}
@Override
public void pause() {
}
@Override
public void unpause() {
}
@Override
public void terminate() {
}
@Override
public FrontierJournal getFrontierJournal() {
return null;
}
@Override
public String getClassKey(CrawlURI cauri) {
return null;
}
@Override
public DecideRule getScope() {
return null;
}
@Override
public void run() {
}
@Override
public void requestState(State target) {
}
@Override
public void beginDisposition(CrawlURI curi) {
}
@Override
public void endDisposition() {
}
}
// One method per (unit, entity, force-retire) scenario so a JUnit failure's
// method name immediately identifies which combination broke; all delegate
// to the parameterized testNovel() below.
public void testHostNovelKbForceRetire() throws URIException, InterruptedException {
testNovel("kb", "host", true);
}
public void testServerNovelKbForceRetire() throws URIException, InterruptedException {
testNovel("kb", "server", true);
}
public void testGroupNovelKbForceRetire() throws URIException, InterruptedException {
testNovel("kb", "group", true);
}
public void testHostNovelKbNoForceRetire() throws URIException, InterruptedException {
testNovel("kb", "host", false);
}
public void testServerNovelKbNoForceRetire() throws URIException, InterruptedException {
testNovel("kb", "server", false);
}
public void testGroupNovelKbNoForceRetire() throws URIException, InterruptedException {
testNovel("kb", "group", false);
}
public void testHostNovelUrlsForceRetire() throws URIException, InterruptedException {
testNovel("urls", "host", true);
}
public void testServerNovelUrlsForceRetire() throws URIException, InterruptedException {
testNovel("urls", "server", true);
}
public void testGroupNovelUrlsForceRetire() throws URIException, InterruptedException {
testNovel("urls", "group", true);
}
public void testHostNovelUrlsNoForceRetire() throws URIException, InterruptedException {
testNovel("urls", "host", false);
}
public void testServerNovelUrlsNoForceRetire() throws URIException, InterruptedException {
testNovel("urls", "server", false);
}
public void testGroupNovelUrlsNoForceRetire() throws URIException, InterruptedException {
testNovel("urls", "group", false);
}
/**
 * Parameterized core of the quota tests: configures exactly one novel-urls
 * or novel-kb quota on the chosen entity (host/server/group), then checks
 * that a fresh URI proceeds, a URI after the quota is exceeded is finished
 * (retired or blocked depending on forceRetire), and a URI on a different
 * host is unaffected.
 *
 * @param urlsOrKb          "urls" or "kb": which quota dimension to set
 * @param hostServerOrGroup "host", "server", or "group": which entity's quota
 * @param forceRetire       whether QuotaEnforcer should retire the queue
 *                          instead of blocking the URI
 */
protected void testNovel(String urlsOrKb, String hostServerOrGroup, boolean forceRetire) throws URIException, InterruptedException {
    QuotaEnforcer enforcer = new QuotaEnforcer();
    MockServerCache cache = new MockServerCache();
    enforcer.setServerCache(cache);
    MockFrontier mockFrontier = new MockFrontier();
    enforcer.setFrontier(mockFrontier);
    enforcer.setForceRetire(forceRetire);

    boolean byUrls = "urls".equals(urlsOrKb);
    boolean byKb = "kb".equals(urlsOrKb);

    // guard against typos in the delegating test methods
    assertTrue(byUrls || byKb);
    assertTrue("host".equals(hostServerOrGroup) || "server".equals(hostServerOrGroup) || "group".equals(hostServerOrGroup));

    // configure exactly one quota so we know which limit tripped
    if ("host".equals(hostServerOrGroup)) {
        if (byUrls) {
            enforcer.setHostMaxNovelUrls(1);
            assertEquals(1, enforcer.getHostMaxNovelUrls());
        } else if (byKb) {
            enforcer.setHostMaxNovelKb(100);
            assertEquals(100, enforcer.getHostMaxNovelKb());
        }
    } else if ("server".equals(hostServerOrGroup)) {
        if (byUrls) {
            enforcer.setServerMaxNovelUrls(1);
            assertEquals(1, enforcer.getServerMaxNovelUrls());
        } else {
            enforcer.setServerMaxNovelKb(100);
            assertEquals(100, enforcer.getServerMaxNovelKb());
        }
    } else if ("group".equals(hostServerOrGroup)) {
        if (byUrls) {
            enforcer.setGroupMaxNovelUrls(1);
            assertEquals(1, enforcer.getGroupMaxNovelUrls());
        } else {
            enforcer.setGroupMaxNovelKb(100);
            assertEquals(100, enforcer.getGroupMaxNovelKb());
        }
    }

    // first uri: no stats accumulated yet, so it should sail through
    CrawlURI uri = new CrawlURI(UURIFactory.getInstance("http://example.com/1"));
    assertEquals(ProcessResult.PROCEED, enforcer.process(uri));
    if (forceRetire) {
        assertNull(uri.getData().get(CoreAttributeConstants.A_FORCE_RETIRE));
    } else {
        assertEquals(FetchStatusCodes.S_UNATTEMPTED, uri.getFetchStatus());
    }

    // install canned stats on exactly the entity under test, so a quota
    // checking the wrong entity cannot make the test pass by accident
    CanSetSubstats statHolder;
    if ("host".equals(hostServerOrGroup)) {
        MockCrawlHost mockHost = new MockCrawlHost("example.com");
        cache.setHostFor("example.com", mockHost);
        statHolder = mockHost;
    } else if ("server".equals(hostServerOrGroup)) {
        MockCrawlServer mockServer = new MockCrawlServer("example.com");
        cache.setServerFor("example.com", mockServer);
        statHolder = mockServer;
    } else {
        statHolder = (MockFrontierGroup) mockFrontier.getGroup(uri);
    }
    MockFetchStats canned = new MockFetchStats();
    if (byUrls) {
        canned.setNovelUrls(1);
    } else {
        canned.setNovelBytes(200000);
    }
    statHolder.setSubstats(canned);

    // a second uri on the same host should now hit the quota
    uri = new CrawlURI(UURIFactory.getInstance("http://example.com/2"));
    assertEquals(ProcessResult.FINISH, enforcer.process(uri));
    if (forceRetire) {
        assertTrue((Boolean) uri.getData().get(CoreAttributeConstants.A_FORCE_RETIRE));
    } else {
        assertEquals(FetchStatusCodes.S_BLOCKED_BY_QUOTA, uri.getFetchStatus());
    }

    // a different host is unaffected by example.com's quota
    uri = new CrawlURI(UURIFactory.getInstance("http://example.org/"));
    assertEquals(ProcessResult.PROCEED, enforcer.process(uri));
    if (forceRetire) {
        assertNull(uri.getData().get(CoreAttributeConstants.A_FORCE_RETIRE));
    } else {
        assertEquals(FetchStatusCodes.S_UNATTEMPTED, uri.getFetchStatus());
    }
}
}