/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.crawler.selftest;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.io.IOUtils;
/**
* Tests that operators can manually assign precedence values to individual
* URLs.
*
* <p>This class crawls the same directory structure as
* {@link Precedence1SelfTest}, using the same number of sheets. However,
* instead of creating groups of URIs using SURT prefixes, the HiPri and
* LoPri sheets are assigned to two individual URIs. The test then ensures
* that the HiPri URI is crawled before anything else (apart from
* robots.txt), and that the LoPri URI is crawled after everything else.
*
* @author pjack
*/
public class Precedence4SelfTest extends Precedence1SelfTest {

    /**
     * Index of the first URI character on a crawl.log line; everything
     * before it is skipped when parsing.
     * NOTE(review): presumably the combined width of the fixed-width columns
     * that precede the URI in crawl.log -- confirm against the log writer.
     */
    private static final int URI_COLUMN = 42;

    /**
     * Reads crawl.log from the logs directory, collects the URI of every
     * entry in file order, and checks that robots.txt was logged first,
     * {@code /five/a.html} (the hiPri URI) second, and {@code /five/b.html}
     * (the loPri URI) last.
     *
     * @throws Exception if the log cannot be read
     * @throws AssertionError if a non-blank log line is too short to hold a
     *         URI, or if the log holds fewer than two entries
     */
    @Override
    protected void verify() throws Exception {
        File crawlLog = new File(getLogsDir(), "crawl.log");
        List<String> crawled = new ArrayList<String>();
        BufferedReader br = null;
        try {
            br = new BufferedReader(new FileReader(crawlLog));
            for (String line = br.readLine(); line != null; line = br.readLine()) {
                String uri = extractUri(line);
                if (uri != null) {
                    crawled.add(uri);
                }
            }
        } finally {
            IOUtils.closeQuietly(br);
        }

        // Fail with a readable message rather than an IndexOutOfBoundsException
        // when the crawl produced too few log entries to check the ordering.
        if (crawled.size() < 2) {
            throw new AssertionError("Expected at least 2 entries in "
                    + crawlLog + " but found " + crawled.size() + ": " + crawled);
        }

        assertEquals("http://127.0.0.1:7777/robots.txt", crawled.get(0));
        assertEquals("http://127.0.0.1:7777/five/a.html", crawled.get(1));
        assertEquals("http://127.0.0.1:7777/five/b.html", crawled.get(crawled.size() - 1));
    }

    /**
     * Extracts the URI field from one crawl.log line: the text starting at
     * {@link #URI_COLUMN} and running up to the next space, or to the end of
     * the line when no space follows.
     *
     * @param line one line of crawl.log, without its line terminator
     * @return the URI field, or {@code null} if the line is blank
     * @throws AssertionError if the line is non-blank but does not extend
     *         past {@link #URI_COLUMN}
     */
    private static String extractUri(String line) {
        if (line.trim().length() == 0) {
            // Blank lines (e.g. a trailing empty line) carry no entry.
            return null;
        }
        if (line.length() <= URI_COLUMN) {
            throw new AssertionError("Malformed crawl.log line (expected a URI at column "
                    + URI_COLUMN + "): '" + line + "'");
        }
        int end = line.indexOf(' ', URI_COLUMN);
        return (end < 0)
                ? line.substring(URI_COLUMN)
                : line.substring(URI_COLUMN, end);
    }

    /**
     * Returns the ten seed URIs for this test, separated by the two-character
     * sequence backslash + {@code n} (the Java literal is {@code "\\n"}, not
     * a real line break).
     * NOTE(review): presumably the consumer of this string turns that escape
     * into newlines, and this overrides a method of the same name in a
     * superclass -- confirm both.
     *
     * @return the seed list as a single string
     */
    protected String getSeedsString() {
        return "http://127.0.0.1:7777/seed.html\\n"+
                "http://127.0.0.1:7777/one/a.html\\n"+
                "http://127.0.0.1:7777/five/a.html\\n"+
                "http://127.0.0.1:7777/ten/a.html\\n"+
                "http://127.0.0.1:7777/ten/b.html\\n"+
                "http://127.0.0.1:7777/five/b.html\\n"+
                "http://127.0.0.1:7777/one/b.html\\n"+
                "http://127.0.0.1:7777/five/c.html\\n"+
                "http://127.0.0.1:7777/one/c.html\\n"+
                "http://127.0.0.1:7777/ten/c.html";
    }

    /**
     * Inserts two sheet/association bean pairs immediately before every
     * {@code </beans>} occurrence in the given configuration text:
     * <ul>
     * <li>{@code loPri}: associated with {@code /five/b.html}, sets
     *     {@code preparer.uriPrecedencePolicy.basePrecedence} to 10;</li>
     * <li>{@code hiPri}: associated with {@code /five/a.html}, sets the same
     *     key to 1.</li>
     * </ul>
     * Given what {@link #verify()} expects, the smaller value is meant to be
     * crawled earlier.
     *
     * @param config the crawl configuration XML as text
     * @return the configuration with the sheet beans added
     */
    protected String configureSheets(String config) {
        // add sheets which overlay alternate precedence values for two
        // specific URIs
        String sheets =
            "<bean class='org.archive.crawler.spring.SurtPrefixesSheetAssociation'>\n" +
            " <property name='surtPrefixes'>\n" +
            " <list>\n" +
            " <value>http://(127.0.0.1:7777)/five/b.html</value>\n" +
            " </list>\n" +
            " </property>\n" +
            " <property name='targetSheetNames'>\n" +
            " <list>\n" +
            " <value>loPri</value>\n" +
            " </list>\n" +
            " </property>\n" +
            "</bean>\n" +
            "<bean id='loPri' class='org.archive.spring.Sheet'>\n" +
            " <property name='map'>\n" +
            " <map>\n" +
            " <entry key='preparer.uriPrecedencePolicy.basePrecedence' value='10'/>\n" +
            " </map>\n" +
            " </property>\n" +
            "</bean>\n" +
            "<bean class='org.archive.crawler.spring.SurtPrefixesSheetAssociation'>\n" +
            " <property name='surtPrefixes'>\n" +
            " <list>\n" +
            " <value>http://(127.0.0.1:7777)/five/a.html</value>\n" +
            " </list>\n" +
            " </property>\n" +
            " <property name='targetSheetNames'>\n" +
            " <list>\n" +
            " <value>hiPri</value>\n" +
            " </list>\n" +
            " </property>\n" +
            "</bean>\n" +
            "<bean id='hiPri' class='org.archive.spring.Sheet'>\n" +
            " <property name='map'>\n" +
            " <map>\n" +
            " <entry key='preparer.uriPrecedencePolicy.basePrecedence' value='1'/>\n" +
            " </map>\n" +
            " </property>\n" +
            "</bean>\n";
        config = config.replace("</beans>", sheets+"</beans>");
        return config;
    }
}