// Precedence3SelfTest.java (from heritrix3-master)
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
 
package org.archive.crawler.selftest;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.io.IOUtils;

/**
 * Tests that URLs can be assigned precedence values by an in-line analysis
 * module that prioritizes newly discovered links based on an approximate
 * ranking during a crawl.
 * 
 * <p>The test data consists of 15 documents, titled A through O.  Each document
 * links to two other documents, forming a sorted, balanced binary tree:
 * 
 * <ul>
 * <li>H</li>
 *     <ul>
 *     <li>D</li>
 *         <ul>
 *         <li>B</li>
 *             <ul>
 *             <li>A</li>
 *             <li>C</li>
 *             </ul>
 *         <li>F</li>
 *             <ul>
 *             <li>E</li>
 *             <li>G</li>
 *             </ul>
 *         </ul>
 *     <li>L</li>
 *         <ul>
 *         <li>J</li>
 *             <ul>
 *             <li>I</li>
 *             <li>K</li>
 *             </ul>
 *         <li>N</li>
 *             <ul>
 *             <li>M</li>
 *             <li>O</li>
 *             </ul>
 *         </ul>
 *     </ul>
 * </ul>
 * 
 * <p>If H is the seed, then Heritrix would ordinarily crawl these in the order
 * <code>(H, L, D, J, N, F, B, K, I, M, O, G, E, C, A)</code> -- loosely the
 * order the links were discovered.
 * 
 * <p>However, this test uses the {@link KeyWordProcessor} to ensure that, if
 * a document contains a certain keyword, then that document's out links are 
 * crawled before the out links of documents that do not contain the keyword.
 * 
 * <p>The documents A, B, D, E, H, I, J and M all contain the keyword (these
 * are the "left-branch"/first-link documents in the tree above, plus the 
 * root/seed).  The other documents do not.
 * 
 * <p>Therefore this test makes sure that the children of documents containing
 * the keyword are crawled before children of documents not containing the 
 * keyword:
 * 
 * <ol>
 * <li>The children of D (B and F) should be crawled before the children 
 * of L (J and N).</li>
 * <li>The children of B (A and C) should be crawled before the children 
 * of F (E and G).</li>
 * <li>The children of J (I and K) should be crawled before the children of
 * N (M and O).</li>
 * </ol>
 * 
 * <p>This test provides a simple proof-of-concept that shows how the content
 * of one URI can alter the precedence of the out links of that URI.  See
 * {@link KeyWordProcessor} for suggestions on more sophisticated approaches.
 * 
 * @author pjack
 */
public class Precedence3SelfTest extends SelfTestBase {

    /** Scheme, host and port prefix shared by every URI this test expects. */
    private static final String BASE_URI = "http://127.0.0.1:7777/";

    /**
     * Number of leading characters skipped on each crawl.log line to reach
     * the URI field.  NOTE(review): presumably the fixed-width timestamp,
     * status and size columns -- confirm against the crawl.log format before
     * changing this value.
     */
    private static final int URI_COLUMN = 42;

    /**
     * Reads the URIs listed in <code>crawl.log</code> and checks that
     * robots.txt and the seed come first, and that the children of each
     * keyword-containing document appear before the children of its
     * keyword-free sibling.
     *
     * @throws Exception if the crawl log cannot be read
     */
    @Override
    protected void verify() throws Exception {
        List<String> crawled =
            readCrawledUris(new File(getLogsDir(), "crawl.log"));

        System.out.println(crawled);

        // Guard the get(0)/get(1) calls below so that a truncated log fails
        // with a readable message instead of an IndexOutOfBoundsException.
        assertTrue("crawl.log should list at least robots.txt and the seed: "
                + crawled, crawled.size() >= 2);
        assertEquals(BASE_URI + "robots.txt", crawled.get(0));
        assertEquals(toFullURI('H'), crawled.get(1));

        // D contains the keyword and L does not.
        // D's children (B and F) should be crawled before L's (J and N).
        assertBefore(crawled, 'B', 'F', 'J', 'N');

        // B contains the keyword and F does not.
        // B's children (A and C) should be crawled before F's (E and G).
        assertBefore(crawled, 'A', 'C', 'E', 'G');

        // J contains the keyword and N does not.
        // J's children (I and K) should be crawled before N's (M and O).
        assertBefore(crawled, 'I', 'K', 'M', 'O');
    }

    /**
     * Extracts the URI field from every non-blank line of the given crawl
     * log, in file order.
     *
     * @param crawlLog the crawl log file to read
     * @return the URIs in the order their lines appear in the log
     * @throws Exception if the file cannot be opened or read
     */
    private List<String> readCrawledUris(File crawlLog) throws Exception {
        List<String> crawled = new ArrayList<String>();
        BufferedReader br = null;
        try {
            br = new BufferedReader(new FileReader(crawlLog));
            for (String line = br.readLine(); line != null;
                    line = br.readLine()) {
                // A blank line carries no URI; skip it rather than fail.
                if (line.trim().length() == 0) {
                    continue;
                }
                assertTrue("crawl.log line too short to hold a URI: " + line,
                        line.length() > URI_COLUMN);
                String rest = line.substring(URI_COLUMN);
                // The URI runs up to the next space; if there is none, the
                // URI is the remainder of the line.
                int end = rest.indexOf(' ');
                crawled.add(end < 0 ? rest : rest.substring(0, end));
            }
        } finally {
            IOUtils.closeQuietly(br);
        }
        return crawled;
    }

    /**
     * Asserts that both documents <code>k1</code> and <code>k2</code> were
     * crawled, and that each of them was crawled before both
     * <code>n1</code> and <code>n2</code>.
     *
     * @param crawled the crawled URIs, in crawl order
     * @param k1 first child of the keyword-containing parent
     * @param k2 second child of the keyword-containing parent
     * @param n1 first child of the parent without the keyword
     * @param n2 second child of the parent without the keyword
     */
    private void assertBefore(List<String> crawled,
            char k1, char k2, char n1, char n2) {
        // Make sure all four documents were actually crawled.
        int k1Index = indexOfCrawled(crawled, k1);
        int k2Index = indexOfCrawled(crawled, k2);
        int n1Index = indexOfCrawled(crawled, n1);
        int n2Index = indexOfCrawled(crawled, n2);

        // Make sure children of keyword-containing-parent were crawled before
        // children of no-keyword-containing-parent.
        assertOrder(k1, k1Index, n1, n1Index);
        assertOrder(k1, k1Index, n2, n2Index);
        assertOrder(k2, k2Index, n1, n1Index);
        assertOrder(k2, k2Index, n2, n2Index);
    }

    /**
     * Returns the position of the given document in the crawl order, failing
     * the test if the document is absent.
     *
     * @param crawled the crawled URIs, in crawl order
     * @param ch the single-letter document name
     * @return the zero-based index of the document's URI in the list
     */
    private int indexOfCrawled(List<String> crawled, char ch) {
        String uri = toFullURI(ch);
        int index = crawled.indexOf(uri);
        // List.indexOf reports a missing element as -1.
        assertTrue(uri + " was not found in crawl.log", index >= 0);
        return index;
    }

    /**
     * Asserts that one document's crawl position precedes another's.
     *
     * @param first the document expected earlier
     * @param firstIndex crawl position of <code>first</code>
     * @param second the document expected later
     * @param secondIndex crawl position of <code>second</code>
     */
    private void assertOrder(char first, int firstIndex,
            char second, int secondIndex) {
        assertTrue(toFullURI(first) + " (position " + firstIndex
                + ") should be crawled before " + toFullURI(second)
                + " (position " + secondIndex + ")",
                firstIndex < secondIndex);
    }

    /**
     * Builds the full URI of a test document from its single-letter name.
     *
     * @param ch the document name, e.g. <code>'H'</code>
     * @return the URI of that document's <code>.html</code> page
     */
    private String toFullURI(char ch) {
        return BASE_URI + ch + ".html";
    }

    /**
     * Returns the seed for this test: the document H.
     *
     * @return the URI of <code>H.html</code>
     */
    protected String getSeedsString() {
        return toFullURI('H');
    }

    /**
     * Customizes the crawl configuration: limits the crawl to one toe thread,
     * installs the keyword-based URI precedence policy bean, and inserts the
     * keyword processor after the <code>candidateScoper</code> reference.
     *
     * @param config the configuration template text
     * @return the result of passing the modified text to the superclass
     */
    @Override
    protected String changeGlobalConfig(String config) {
        // single toethread
        config = config.replace("@@MORE_PROPERTIES@@", "crawlController.maxToeThreads=1");

        // add the keyword-based uriPrecedencePolicy
        String uriPrecedencePolicy =
            "<bean class='org.archive.crawler.selftest.KeyWordUriPrecedencePolicy'>\n" +
            "  <property name='basePrecedence' value='5'/>\n" +
            " </bean>";
        config = config.replace("<!--@@BEANS_MOREBEANS@@-->", uriPrecedencePolicy);

        // add the keyword processor after candidateScoper
        config = config.replace(
                "<ref bean=\"candidateScoper\"/>",
                "<ref bean=\"candidateScoper\"/>\n" +
                "<bean class='org.archive.crawler.selftest.KeyWordProcessor'/>");
        return super.changeGlobalConfig(config);
    }
}