/* * This file is part of the Heritrix web crawler (crawler.archive.org). * * Licensed to the Internet Archive (IA) by one or more individual * contributors. * * The IA licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.archive.crawler.selftest; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.util.ArrayList; import java.util.List; import org.apache.commons.io.IOUtils; /** * Tests that URLs can be assigned precedence values based on in-line analysis * module that prioritizes newly discovered links based on an approximate * ranking during a crawl. * * <p>The test data consists of 15 documents, titled A through O. Each document * links to two other documents, forming a sorted, balanced binary tree: * * <ul> * <li>H</li> * <ul> * <li>D</li> * <ul> * <li>B</li> * <ul> * <li>A</li> * <li>C</li> * </ul> * <li>F</li> * <ul> * <li>E</li> * <li>G</li> * </ul> * </ul> * <li>L</li> * <ul> * <li>J</li> * <ul> * <li>I</li> * <li>K</li> * </ul> * <li>N</li> * <ul> * <li>M</li> * <li>O</li> * </ul> * </ul> * </ul> * </ul> * * <p>If H is the seed, then Heritrix would ordinarily crawl these in the order * <code>(H, L, D, J, N, F, B, K, I, M, O, G, E, C, A)</code> -- loosely the * order the links were discovered. * * <p>However, this test uses the {@link KeyWordProcessor} to ensure that, if * a document contains a certain keyword, then that document's out links are * crawled before the out links of documents that do not contain the keyword. * * <p>The documents A, B, D, E, H, I, J and M all contain the keyword (these * are the "left-branch"/first-link documents in the tree above, plus the * root/seed). The other documents do not. * * <p>Therefore this test makes sure that the children of documents containing * the keyword are crawled before children of documents not containing the * keyword: * * <ol> * <li>The children of D (B and F) should be crawled before the children * of L (J and N).</li> * <li>The children of B (A and C) should be crawled before the children * of F (E and G).</li> * <li>The children of J (I and K) should be crawled before the children of * N (M and O). * </ol> * * <p>This test provides a simple proof-of-concept that shows how the content * of one URI can alter the precedence of the out links of that URI. See * {@link KeyWordProcessor} for suggestions on more sophisticated approaches. * * @author pjack */ public class Precedence3SelfTest extends SelfTestBase { @Override protected void verify() throws Exception { File crawlLog = new File(getLogsDir(), "crawl.log"); BufferedReader br = null; List<String> crawled = new ArrayList<String>(); try { br = new BufferedReader(new FileReader(crawlLog)); for (String s = br.readLine(); s != null; s = br.readLine()) { s = s.substring(42); int i = s.indexOf(' '); s = s.substring(0, i); crawled.add(s); } } finally { IOUtils.closeQuietly(br); } System.out.println(crawled); assertEquals("http://127.0.0.1:7777/robots.txt", crawled.get(0)); assertEquals("http://127.0.0.1:7777/H.html", crawled.get(1)); // D contains the keyword and L does not. // D's children (B and F) should be crawled before L's (J and N). assertBefore(crawled, 'B', 'F', 'J', 'N'); // B contains the keyword and F does not. // B's children (A and C) should be crawled before F's (E and G). assertBefore(crawled, 'A', 'C', 'E', 'G'); // J contains the keyword and N does not. // J's children (I and K) should be crawled before N's (M and O). assertBefore(crawled, 'I', 'K', 'M', 'O'); } private boolean assertBefore(List<String> crawled, char k1, char k2, char n1, char n2) { int k1Index = crawled.indexOf(toFullURI(k1)); int k2Index = crawled.indexOf(toFullURI(k2)); int n1Index = crawled.indexOf(toFullURI(n1)); int n2Index = crawled.indexOf(toFullURI(n2)); // Make sure all four documents were actually crawled. assertTrue(k1Index > 0); assertTrue(k2Index > 0); assertTrue(n1Index > 0); assertTrue(n2Index > 0); // Make sure children of keyword-containing-parent were crawled before // children of no-keyword-containing-parent. assertTrue(k1Index + " >= " + n1Index, k1Index < n1Index); assertTrue(k1Index + " >= " + n2Index, k1Index < n2Index); assertTrue(k2Index + " >= " + n1Index, k2Index < n1Index); assertTrue(k2Index + " >= " + n1Index, k2Index < n2Index); return false; } private String toFullURI(char ch) { return "http://127.0.0.1:7777/" + ch + ".html"; } protected String getSeedsString() { return "http://127.0.0.1:7777/H.html"; } @Override protected String changeGlobalConfig(String config) { // single toethread config = config.replace("@@MORE_PROPERTIES@@", "crawlController.maxToeThreads=1"); // add the keyword-based uriPrecedencePolicy String uriPrecedencePolicy = "<bean class='org.archive.crawler.selftest.KeyWordUriPrecedencePolicy'>\n" + " <property name='basePrecedence' value='5'/>\n" + " </bean>"; config = config.replace("<!--@@BEANS_MOREBEANS@@-->", uriPrecedencePolicy); // add the keyword processor after candidateScoper config = config.replace( "<ref bean=\"candidateScoper\"/>", "<ref bean=\"candidateScoper\"/>\n" + "<bean class='org.archive.crawler.selftest.KeyWordProcessor'/>"); return super.changeGlobalConfig(config); } }