package org.archive.crawler.selftest;

import java.io.File;

import org.archive.crawler.frontier.precedence.PrecedenceLoader;

/**
 * Verifies that per-URI precedence values produced by an offline analysis
 * can be imported into a crawl.
 *
 * <p>The crawled directory structure is the same one used by
 * {@link Precedence1SelfTest}, and the URIs are expected to be crawled in the
 * same order. Here, however, the ordering is obtained through a
 * {@link org.archive.crawler.frontier.precedence.PreloadedUriPrecedencePolicy},
 * which loads per-URI precedence information from an external file.
 *
 * <p>Such a file could be generated from PageRank analysis of a previously
 * completed crawl; see the
 * <a href="http://webteam.archive.org/confluence/display/Heritrix/Offline+PageRank+Analysis+Notes">Offline
 * PageRank Analysis Notes</a>. (For this minimal functional test, the
 * PreloadedUriPrecedencePolicy input file was simply hand-generated.)
 *
 * @author pjack
 */
public class Precedence2SelfTest extends Precedence1SelfTest {

    /**
     * Rewrites the crawl configuration so that a top-level (autowired)
     * {@code uriPrecedencePolicy} bean with preloaded values is declared, and
     * so that the frontier-properties placeholder is emptied before the
     * superclass processes the configuration.
     *
     * @param config the configuration text containing the
     *        {@code <!--@@BEANS_MOREBEANS@@-->} and
     *        {@code <!--@@FRONTIER_PROPERTIES@@-->} placeholders
     * @return the result of passing the rewritten text to the superclass
     *         implementation
     */
    @Override
    protected String changeGlobalConfig(String config) {
        // Bean definition for the preloaded precedence policy.
        final String policyBean =
                " <bean id='uriPrecedencePolicy' class='org.archive.crawler.frontier.precedence.PreloadedUriPrecedencePolicy'>\n"
                + " <property name='basePrecedence' value='5'/>\n"
                + " </bean>";

        // Declare the policy bean at the "more beans" placeholder.
        final String withPolicyBean =
                config.replace("<!--@@BEANS_MOREBEANS@@-->", policyBean);

        // Blank the frontier placeholder first so the superclass has nowhere
        // to insert its own inner-bean policy.
        final String withoutFrontierSlot =
                withPolicyBean.replace("<!--@@FRONTIER_PROPERTIES@@-->", "");

        return super.changeGlobalConfig(withoutFrontierSlot);
    }

    /**
     * Invokes {@link PrecedenceLoader#main(String[])} with two arguments: the
     * absolute path of {@code rank.txt} in the job directory, followed by the
     * absolute path of {@code state} in the job directory.
     *
     * @throws Exception if the loader run fails
     */
    @Override
    protected void configureHeritrix() throws Exception {
        // NOTE(review): presumably rank.txt holds the hand-generated
        // precedence data and "state" is where the loader stores it --
        // confirm against PrecedenceLoader.
        final String rankFilePath =
                new File(getJobDir(), "rank.txt").getAbsolutePath();
        final String statePath =
                new File(getJobDir(), "state").getAbsolutePath();

        PrecedenceLoader.main(new String[] { rankFilePath, statePath });
    }
}