/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.crawler.selftest;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import org.archive.crawler.frontier.precedence.BaseUriPrecedencePolicy;
import org.archive.util.ArchiveUtils;
/**
* Tests that operators can create precedence groups for URIs, and that URIs
* in one group are crawled before URIs in another group per operator preference.
*
* <p>The embedded Jetty HTTP server for this test provides the following
* document tree:
*
* <ul>
* <li>seed.html</li>
* <li>one/</li>
* <ul>
* <li>a.html</li>
* <li>b.html</li>
* <li>c.html</li>
* </ul>
* <li>five/</li>
* <ul>
* <li>a.html</li>
* <li>b.html</li>
* <li>c.html</li>
* </ul>
* <li>ten/</li>
* <ul>
* <li>a.html</li>
* <li>b.html</li>
* <li>c.html</li>
* </ul>
* </ul>
*
* (See the <code>engine/testdata/selftest/Precedence1SelfTest</code>
* directory to view these files.) The <code>seed.html</code> file contains
* links to <code>five/a.html</code>, <code>ten/a.html</code>, and
* <code>one/a.html</code>, in that order. The <code>a.html</code> files link
* to the <code>b.html</code> files, and the <code>b.html</code> files link to
* the <code>c.html</code> files, which have no out links.
*
* <p>Ordinarily Heritrix would crawl these in (roughly) the order the links
* are discovered:
*
* <ol>
* <li>seed.html</li>
* <li>five/a.html</li>
* <li>ten/a.html</li>
* <li>one/a.html</li>
* <li>five/b.html</li>
* <li>ten/b.html</li>
* <li>one/b.html</li>
* <li>five/c.html</li>
* <li>ten/c.html</li>
* <li>one/c.html</li>
* </ol>
*
* <p>However, the crawl configuration for this test uses a
* {@link BaseUriPrecedencePolicy} instead of the default
* {@link org.archive.crawler.frontier.policy.CostUriPrecedencePolicy}. The
* <code>BaseUriPrecedencePolicy</code> is configured so that all URIs have a
* precedence value of 5 unless otherwise specified.
*
* <p>There is a sheet named <code>HiPri</code> that overrides the
* <code>base-precedence</code> to be 1 instead of 5; thus URIs associated
* with the HiPri sheet should be crawled before other URIs.
* Similarly, there is a sheet named <code>LoPri</code> that overrides
* <code>base-precedence</code> to be 10 instead of 5. URLs associated with
* LoPri should be crawled after other URLs.
*
* <p>The <code>one/</code> directory is associated with the HiPri sheet, and
* the <code>ten/</code> directory is associated with the LoPri sheet. This
* creates three "groups" of URIs: one, five and ten. All of the URIs in
* group "one" should be crawled before any of the URIs in group "five" are
* crawled. Similarly, all of the URIs in group "five" should be crawled before
* any of the URIs in group "ten".
*
* <p>So the final order in which URLs should be crawled in this test is:
*
* <ol>
* <li>seed.html</li>
* <li>one/a.html</li>
* <li>one/b.html</li>
* <li>one/c.html</li>
* <li>five/a.html</li>
* <li>five/b.html</li>
* <li>five/c.html</li>
* <li>ten/a.html</li>
* <li>ten/b.html</li>
* <li>ten/c.html</li>
* </ol>
*
* <p>This test ensures that the documents were crawled in the correct order.
*
* <p>This test uses the directory structure of the URIs to group them into
* precedence groups only because the test executes on just one machine.
* The same basic configuration could be used to group URIs by any SURT
* prefix -- by host or by domain, even by top-level domain. So an operator
* could associate HiPri with all .gov sites to ensure that all .gov URIs
* are crawled before any non-.gov URIs.
*
* @author pjack
*/
public class Precedence1SelfTest extends SelfTestBase {

    /**
     * Expected results of the crawl: one URI per line, each terminated by a
     * newline, in the exact order the URIs must appear in the crawl log.
     */
    private static final String EXPECTED =
        "http://127.0.0.1:7777/robots.txt\n" +
        "http://127.0.0.1:7777/seed.html\n" +
        "http://127.0.0.1:7777/favicon.ico\n" +
        "http://127.0.0.1:7777/one/a.html\n" +
        "http://127.0.0.1:7777/one/b.html\n" +
        "http://127.0.0.1:7777/one/c.html\n" +
        "http://127.0.0.1:7777/five/a.html\n" +
        "http://127.0.0.1:7777/five/b.html\n" +
        "http://127.0.0.1:7777/five/c.html\n" +
        "http://127.0.0.1:7777/ten/a.html\n" +
        "http://127.0.0.1:7777/ten/b.html\n" +
        "http://127.0.0.1:7777/ten/c.html\n";

    /**
     * Column of a <code>crawl.log</code> line at which the URI field is
     * expected to begin; the URI runs from here up to the next space.
     * NOTE(review): presumably the skipped prefix is the fixed-width
     * timestamp/status/size columns -- confirm against the crawl.log format.
     */
    private static final int CRAWL_LOG_URI_OFFSET = 42;

    /**
     * Reads <code>crawl.log</code> from the logs directory, extracts the URI
     * field of every line, and asserts that the resulting newline-terminated
     * sequence equals {@link #EXPECTED}.
     *
     * @throws Exception if the log cannot be read, or if a line does not
     *         contain a URI field at the expected column
     */
    @Override
    protected void verify() throws Exception {
        File crawlLog = new File(getLogsDir(), "crawl.log");
        // StringBuilder avoids re-copying the accumulated text on every line.
        StringBuilder crawled = new StringBuilder();
        BufferedReader br = null;
        try {
            br = new BufferedReader(new FileReader(crawlLog));
            for (String line = br.readLine(); line != null; line = br.readLine()) {
                crawled.append(extractUri(line)).append('\n');
            }
        } finally {
            // br is still null here if opening the file failed.
            ArchiveUtils.closeQuietly(br);
        }
        assertEquals(EXPECTED, crawled.toString());
    }

    /**
     * Extracts the URI field from a single <code>crawl.log</code> line: the
     * text starting at {@link #CRAWL_LOG_URI_OFFSET} and ending just before
     * the next space character.
     *
     * @param line one line of the crawl log
     * @return the URI field of that line
     * @throws IllegalStateException if the line has no space at or after the
     *         URI column (too short, or otherwise malformed)
     */
    private static String extractUri(String line) {
        // indexOf returns -1 both when the line is shorter than the offset
        // and when no terminating space follows the URI.
        int end = line.indexOf(' ', CRAWL_LOG_URI_OFFSET);
        if (end < 0) {
            throw new IllegalStateException(
                "Malformed crawl.log line (no URI field at column "
                + CRAWL_LOG_URI_OFFSET + "): " + line);
        }
        return line.substring(CRAWL_LOG_URI_OFFSET, end);
    }

    /**
     * Returns the single seed URI for this test.
     *
     * @return the URI of <code>seed.html</code> on the local test server
     */
    protected String getSeedsString() {
        return "http://127.0.0.1:7777/seed.html";
    }

    /**
     * Substitutes a {@link BaseUriPrecedencePolicy} bean (base precedence 5)
     * for the <code>&lt;!--@@BEANS_MOREBEANS@@--&gt;</code> placeholder, adds
     * the precedence sheets via {@link #configureSheets(String)}, and then
     * delegates to the superclass.
     *
     * @param config the crawl configuration XML
     * @return the configuration as returned by the superclass
     */
    @Override
    protected String changeGlobalConfig(String config) {
        // add a uriPrecedencePolicy with overlayable values, IF replaced
        // string not already gone (as if by subclass); replace() is a no-op
        // when the placeholder is absent
        String uriPrecedencePolicy =
            " <bean name='uriPrecedencePolicy' class='org.archive.crawler.frontier.precedence.BaseUriPrecedencePolicy'>\n" +
            " <property name='basePrecedence' value='5'/>\n" +
            " </bean>";
        config = config.replace("<!--@@BEANS_MOREBEANS@@-->", uriPrecedencePolicy);
        config = configureSheets(config);
        return super.changeGlobalConfig(config);
    }

    /**
     * Inserts, immediately before each <code>&lt;/beans&gt;</code> in the
     * configuration, two sheets and their SURT-prefix associations:
     * <code>loPri</code> (basePrecedence 10) for URIs under
     * <code>/ten</code>, and <code>hiPri</code> (basePrecedence 1) for URIs
     * under <code>/one</code>.
     *
     * @param config the crawl configuration XML
     * @return the configuration with the sheet beans added
     */
    protected String configureSheets(String config) {
        // add sheets which overlay alternate precedence values for some URIs
        String sheets =
            "<bean class='org.archive.crawler.spring.SurtPrefixesSheetAssociation'>\n" +
            " <property name='surtPrefixes'>\n" +
            " <list>\n" +
            " <value>http://(127.0.0.1:7777)/ten</value>\n" +
            " </list>\n" +
            " </property>\n" +
            " <property name='targetSheetNames'>\n" +
            " <list>\n" +
            " <value>loPri</value>\n" +
            " </list>\n" +
            " </property>\n" +
            "</bean>\n" +
            "<bean id='loPri' class='org.archive.spring.Sheet'>\n" +
            " <property name='map'>\n" +
            " <map>\n" +
            " <entry key='preparer.uriPrecedencePolicy.basePrecedence' value='10'/>\n" +
            " </map>\n" +
            " </property>\n" +
            "</bean>\n" +
            "<bean class='org.archive.crawler.spring.SurtPrefixesSheetAssociation'>\n" +
            " <property name='surtPrefixes'>\n" +
            " <list>\n" +
            " <value>http://(127.0.0.1:7777)/one</value>\n" +
            " </list>\n" +
            " </property>\n" +
            " <property name='targetSheetNames'>\n" +
            " <list>\n" +
            " <value>hiPri</value>\n" +
            " </list>\n" +
            " </property>\n" +
            "</bean>\n" +
            "<bean id='hiPri' class='org.archive.spring.Sheet'>\n" +
            " <property name='map'>\n" +
            " <map>\n" +
            " <entry key='preparer.uriPrecedencePolicy.basePrecedence' value='1'/>\n" +
            " </map>\n" +
            " </property>\n" +
            "</bean>\n";
        config = config.replace("</beans>", sheets + "</beans>");
        return config;
    }
}