Precedence4SelfTest.java example

Explorer
heritrix3-master
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.archive.crawler.selftest;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.io.IOUtils;

/**
 * Tests that operators can manually assign precedence values to individual 
 * URLs.
 * 
 * <p>This class crawls the same directory structure as 
 * {@link Precedence1SelfTest}, using the same number of sheets.  However, 
 * instead of creating groups of URIs using SURT prefixes, the HiPri and
 * LoPri sheets are assigned to two individual URIs.  The test then ensures
 * that the HiPri URI is crawled before anything else (other than robots.txt),
 * and that the LoPri URI is crawled after everything else.
 * 
 * @author pjack
 */
public class Precedence4SelfTest extends Precedence1SelfTest {

    /**
     * Character offset at which {@link #verify()} expects the URI to begin in
     * every crawl.log line.
     * NOTE(review): presumably the combined width of the fixed-width columns
     * that precede the URI -- confirm against the crawl.log format.
     */
    private static final int URI_OFFSET = 42;

    /**
     * Minimum number of crawl.log entries needed for the positional checks in
     * {@link #verify()} (first, second and last entry are all distinct URIs).
     */
    private static final int MIN_EXPECTED_ENTRIES = 3;

    /**
     * Reads crawl.log and checks the order in which URIs were fetched:
     * robots.txt first, then the URI given the hiPri sheet
     * (<code>/five/a.html</code>), and the URI given the loPri sheet
     * (<code>/five/b.html</code>) last.
     *
     * @throws Exception if crawl.log cannot be read or contains a line that
     *         is too short to hold a URI
     */
    @Override
    protected void verify() throws Exception {
        File crawlLog = new File(getLogsDir(), "crawl.log");
        List<String> crawled = readCrawledUris(crawlLog);

        // Fail with a readable message instead of an IndexOutOfBoundsException
        // from the positional lookups below when the log is empty or truncated.
        if (crawled.size() < MIN_EXPECTED_ENTRIES) {
            throw new AssertionError("Expected at least " + MIN_EXPECTED_ENTRIES
                    + " entries in " + crawlLog + " but found "
                    + crawled.size() + ": " + crawled);
        }

        assertEquals("http://127.0.0.1:7777/robots.txt", crawled.get(0));
        assertEquals("http://127.0.0.1:7777/five/a.html", crawled.get(1));
        assertEquals("http://127.0.0.1:7777/five/b.html", crawled.get(crawled.size() - 1));
    }

    /**
     * Collects the URI field of every non-blank line of the given log file,
     * in file order.
     *
     * @param crawlLog the crawl.log file to read
     * @return the URIs in the order they appear in the log
     * @throws IOException if the file cannot be opened or read
     */
    private static List<String> readCrawledUris(File crawlLog) throws IOException {
        List<String> crawled = new ArrayList<String>();
        BufferedReader br = null;
        try {
            br = new BufferedReader(new FileReader(crawlLog));
            int lineNumber = 0;
            for (String line = br.readLine(); line != null; line = br.readLine()) {
                lineNumber++;
                // Blank lines carry no entry; skip them rather than crash.
                if (line.trim().length() == 0) {
                    continue;
                }
                crawled.add(extractUri(line, lineNumber));
            }
        } finally {
            IOUtils.closeQuietly(br);
        }
        return crawled;
    }

    /**
     * Extracts the URI from one crawl.log line: the text starting at
     * {@link #URI_OFFSET} and running up to the next space, or to the end of
     * the line when no further space follows.
     *
     * @param line a non-blank crawl.log line
     * @param lineNumber 1-based position of the line, used for diagnostics only
     * @return the URI field of the line
     * @throws IllegalStateException if the line ends before the URI column
     */
    private static String extractUri(String line, int lineNumber) {
        if (line.length() <= URI_OFFSET) {
            throw new IllegalStateException("crawl.log line " + lineNumber
                    + " is too short to contain a URI at column " + URI_OFFSET
                    + ": '" + line + "'");
        }
        int end = line.indexOf(' ', URI_OFFSET);
        return end < 0 ? line.substring(URI_OFFSET) : line.substring(URI_OFFSET, end);
    }

    /**
     * Returns the ten seed URIs for this test as a single string.
     *
     * <p>Entries are separated by the two-character sequence backslash + 'n'
     * (the Java literal is <code>"\\n"</code>), not by a newline character.
     * NOTE(review): presumably the consumer of this string unescapes it --
     * confirm before changing the separator.
     *
     * @return the seed list
     */
    protected String getSeedsString() {
        return "http://127.0.0.1:7777/seed.html\\n"+
            "http://127.0.0.1:7777/one/a.html\\n"+
            "http://127.0.0.1:7777/five/a.html\\n"+
            "http://127.0.0.1:7777/ten/a.html\\n"+
            "http://127.0.0.1:7777/ten/b.html\\n"+
            "http://127.0.0.1:7777/five/b.html\\n"+
            "http://127.0.0.1:7777/one/b.html\\n"+
            "http://127.0.0.1:7777/five/c.html\\n"+
            "http://127.0.0.1:7777/one/c.html\\n"+
            "http://127.0.0.1:7777/ten/c.html";
    }

    /**
     * Adds two sheets, each bound to a single URI, to the given bean
     * configuration: <code>loPri</code> (basePrecedence 10) for
     * <code>/five/b.html</code> and <code>hiPri</code> (basePrecedence 1) for
     * <code>/five/a.html</code>.
     *
     * <p>The bean definitions are inserted in front of each occurrence of
     * <code>&lt;/beans&gt;</code> in <code>config</code>.
     *
     * @param config the bean configuration XML to extend
     * @return the configuration with the sheet beans inserted
     */
    protected String configureSheets(String config) {
        // Each sheet needs an association bean (which URI it applies to) and
        // the sheet bean itself (which precedence value it overlays).
        String sheets = 
            "<bean class='org.archive.crawler.spring.SurtPrefixesSheetAssociation'>\n" +
            " <property name='surtPrefixes'>\n" +
            "  <list>\n" +
            "   <value>http://(127.0.0.1:7777)/five/b.html</value>\n" +
            "  </list>\n" +
            " </property>\n" +
            " <property name='targetSheetNames'>\n" +
            "  <list>\n" +
            "   <value>loPri</value>\n" +
            "  </list>\n" +
            " </property>\n" +
            "</bean>\n" +
            "<bean id='loPri' class='org.archive.spring.Sheet'>\n" +
            " <property name='map'>\n" +
            "  <map>\n" +
            "   <entry key='preparer.uriPrecedencePolicy.basePrecedence' value='10'/>\n" +
            "  </map>\n" +
            " </property>\n" +
            "</bean>\n" +
            "<bean class='org.archive.crawler.spring.SurtPrefixesSheetAssociation'>\n" +
            " <property name='surtPrefixes'>\n" +
            "  <list>\n" +
            "   <value>http://(127.0.0.1:7777)/five/a.html</value>\n" +
            "  </list>\n" +
            " </property>\n" +
            " <property name='targetSheetNames'>\n" +
            "  <list>\n" +
            "   <value>hiPri</value>\n" +
            "  </list>\n" +
            " </property>\n" +
            "</bean>\n" +
            "<bean id='hiPri' class='org.archive.spring.Sheet'>\n" +
            " <property name='map'>\n" +
            "  <map>\n" +
            "   <entry key='preparer.uriPrecedencePolicy.basePrecedence' value='1'/>\n" +
            "  </map>\n" +
            " </property>\n" +
            "</bean>\n";

        return config.replace("</beans>", sheets + "</beans>");
    }
}