package focusedCrawler.link.frontier; import static org.hamcrest.CoreMatchers.is; import static org.hamcrest.CoreMatchers.notNullValue; import static org.junit.Assert.assertThat; import java.io.IOException; import java.net.URL; import java.util.ArrayList; import java.util.HashMap; import java.util.Map; import org.junit.After; import org.junit.Before; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; import focusedCrawler.link.frontier.selector.LinkSelector; import focusedCrawler.link.frontier.selector.RandomLinkSelector; import focusedCrawler.link.frontier.selector.TopkLinkSelector; import focusedCrawler.util.DataNotFoundException; import focusedCrawler.util.LinkFilter; import focusedCrawler.util.MetricsManager; public class FrontierManagerTest { @Rule // a new temp folder is created for each test case public TemporaryFolder tempFolder = new TemporaryFolder(); private LinkFilter emptyLinkFilter = new LinkFilter(new ArrayList<String>()); private MetricsManager metricsManager = new MetricsManager(); private Frontier frontier; private String dataPath; private boolean downloadRobots = false; private int minimumAccessTimeInterval = 0; @Before public void setUp() throws Exception { frontier = new Frontier(tempFolder.newFolder().toString(), 1000); dataPath = tempFolder.newFolder().toString(); } @After public void tearDown() throws IOException { } @Test public void shouldNotInsertLinkOutOfScope() throws Exception { // given LinkRelevance link1 = new LinkRelevance(new URL("http://www.example1.com/index.html"), 1); LinkRelevance link2 = new LinkRelevance(new URL("http://www.example2.com/index.html"), 2); Map<String, Integer> scope = new HashMap<String, Integer>(); scope.put("www.example1.com", -1); LinkSelector linkSelector = new RandomLinkSelector(); Frontier frontier = new Frontier(tempFolder.newFolder().toString(), 1000, scope); FrontierManager frontierManager = new FrontierManager(frontier, dataPath, downloadRobots, 2, 2, minimumAccessTimeInterval, linkSelector, emptyLinkFilter, metricsManager); // when frontierManager.insert(link1); frontierManager.insert(link2); LinkRelevance selectedLink1 = frontierManager.nextURL(); DataNotFoundException notFoundException = null; try { frontierManager.nextURL(); } catch(DataNotFoundException e) { notFoundException = e; } // then assertThat(selectedLink1, is(notNullValue())); assertThat(selectedLink1.getURL(), is(notNullValue())); assertThat(selectedLink1.getURL(), is(link1.getURL())); assertThat(notFoundException, is(notNullValue())); assertThat(notFoundException.ranOutOfLinks(), is(true)); frontierManager.close(); } @Test public void shouldInsertUrl() throws Exception { // given LinkSelector linkSelector = new TopkLinkSelector(); FrontierManager frontierManager = new FrontierManager(frontier, dataPath, downloadRobots, 2, 2, minimumAccessTimeInterval, linkSelector, emptyLinkFilter, metricsManager); LinkRelevance link1 = new LinkRelevance(new URL("http://www.example1.com/index.html"), 1, LinkRelevance.Type.FORWARD); // when frontierManager.insert(link1); LinkRelevance nextURL = frontierManager.nextURL(); // then assertThat(nextURL, is(notNullValue())); assertThat(nextURL.getURL(), is(notNullValue())); assertThat(nextURL.getURL(), is(link1.getURL())); assertThat(nextURL.getRelevance(), is(link1.getRelevance())); assertThat(nextURL.getType(), is(link1.getType())); frontierManager.close(); } @Test public void shouldSelectUrlsInsertedAfterFirstSelect() throws Exception { // given int minimumAccessTimeInterval = 500; int linksToLoad = 2; int schedulerMaxLinks = 10; LinkSelector linkSelector = new TopkLinkSelector(); FrontierManager frontierManager = new FrontierManager(frontier, dataPath, downloadRobots, linksToLoad, schedulerMaxLinks, minimumAccessTimeInterval, linkSelector, emptyLinkFilter, metricsManager); LinkRelevance link1 = new LinkRelevance(new URL("http://www.example1.com/index1.html"), 1, LinkRelevance.Type.FORWARD); LinkRelevance link2 = new LinkRelevance(new URL("http://www.example1.com/index2.html"), 1, LinkRelevance.Type.FORWARD); LinkRelevance link3 = new LinkRelevance(new URL("http://www.example2.com/index2.html"), 1, LinkRelevance.Type.FORWARD); // when frontierManager.insert(link1); frontierManager.insert(link2); LinkRelevance nextUrl1 = frontierManager.nextURL(); frontierManager.insert(link3); // at this point, should not return link2, but it should return link3 // because it is from another TLD LinkRelevance nextUrl3 = frontierManager.nextURL(); // then assertThat(nextUrl1, is(notNullValue())); assertThat(nextUrl1.getURL(), is(notNullValue())); assertThat(nextUrl1.getURL(), is(link1.getURL())); assertThat(nextUrl1.getRelevance(), is(link1.getRelevance())); assertThat(nextUrl1.getType(), is(link1.getType())); assertThat(nextUrl3, is(notNullValue())); assertThat(nextUrl3.getURL(), is(notNullValue())); assertThat(nextUrl3.getURL(), is(link3.getURL())); assertThat(nextUrl3.getRelevance(), is(link3.getRelevance())); assertThat(nextUrl3.getType(), is(link3.getType())); frontierManager.close(); } @Test public void shouldInsertRobotsLinkWhenAddDomainForTheFirstTime() throws Exception { // given LinkSelector linkSelector = new TopkLinkSelector(); boolean downloadRobots = true; FrontierManager frontierManager = new FrontierManager(frontier, dataPath, downloadRobots, 2, 2, minimumAccessTimeInterval, linkSelector, emptyLinkFilter, metricsManager); LinkRelevance link1 = new LinkRelevance(new URL("http://www.example1.com/sitemap.xml"), 1, LinkRelevance.Type.FORWARD); // when frontierManager.insert(link1); // then LinkRelevance nextURL; nextURL = frontierManager.nextURL(); assertThat(nextURL, is(notNullValue())); assertThat(nextURL.getURL(), is(notNullValue())); assertThat(nextURL.getURL().toString(), is("http://www.example1.com/robots.txt")); assertThat(nextURL.getType(), is(LinkRelevance.Type.ROBOTS)); nextURL = frontierManager.nextURL(); assertThat(nextURL, is(notNullValue())); assertThat(nextURL.getURL(), is(notNullValue())); assertThat(nextURL.getURL(), is(link1.getURL())); assertThat(nextURL.getRelevance(), is(link1.getRelevance())); assertThat(nextURL.getType(), is(link1.getType())); frontierManager.close(); } @Test public void shouldInsertUrlsAndSelectUrlsInSortedByRelevance() throws Exception { // given LinkSelector linkSelector = new TopkLinkSelector(); FrontierManager frontierManager = new FrontierManager(frontier, dataPath, downloadRobots, 2, 2, minimumAccessTimeInterval, linkSelector, emptyLinkFilter, metricsManager); LinkRelevance link1 = new LinkRelevance(new URL("http://www.example1.com/index.html"), 1); LinkRelevance link2 = new LinkRelevance(new URL("http://www.example2.com/index.html"), 2); LinkRelevance link3 = new LinkRelevance(new URL("http://www.example3.com/index.html"), 3); // when frontierManager.insert(link1); frontierManager.insert(link2); frontierManager.insert(link3); LinkRelevance selectedLink1 = frontierManager.nextURL(); LinkRelevance selectedLink2 = frontierManager.nextURL(); LinkRelevance selectedLink3 = frontierManager.nextURL(); DataNotFoundException notFoundException = null; try { frontierManager.nextURL(); } catch(DataNotFoundException e) { notFoundException = e; } // then // should return only 3 inserted links, 4th should be null assertThat(selectedLink1, is(notNullValue())); assertThat(selectedLink2, is(notNullValue())); assertThat(selectedLink3, is(notNullValue())); assertThat(notFoundException, is(notNullValue())); assertThat(notFoundException.ranOutOfLinks(), is(true)); // should return bigger relevance values first assertThat(selectedLink1.getURL(), is(link3.getURL())); assertThat(selectedLink2.getURL(), is(link2.getURL())); assertThat(selectedLink3.getURL(), is(link1.getURL())); frontierManager.close(); } @Test public void shouldNotReturnAgainALinkThatWasAlreadyReturned() throws Exception { // given LinkSelector linkSelector = new TopkLinkSelector(); FrontierManager frontierManager = new FrontierManager(frontier, dataPath, downloadRobots, 2, 2, minimumAccessTimeInterval , linkSelector, emptyLinkFilter, metricsManager); LinkRelevance link1 = new LinkRelevance(new URL("http://www.example1.com/index.html"), 1); LinkRelevance link2 = new LinkRelevance(new URL("http://www.example2.com/index.html"), 2); // when frontierManager.insert(link1); frontierManager.insert(link2); LinkRelevance selectedLink1 = frontierManager.nextURL(); LinkRelevance selectedLink2 = frontierManager.nextURL(); DataNotFoundException notFoundException1 = null; try { frontierManager.nextURL(); } catch(DataNotFoundException e) { notFoundException1 = e; } frontierManager.insert(link1); // insert link 1 again, should not be returned DataNotFoundException notFoundException2 = null; try { frontierManager.nextURL(); } catch(DataNotFoundException e) { notFoundException2 = e; } // then assertThat(selectedLink1, is(notNullValue())); assertThat(selectedLink2, is(notNullValue())); assertThat(notFoundException1, is(notNullValue())); assertThat(notFoundException1.ranOutOfLinks(), is(true)); assertThat(notFoundException2, is(notNullValue())); assertThat(notFoundException2.ranOutOfLinks(), is(true)); frontierManager.close(); } @Test public void shouldNotReturnLinkReturnedWithinMinimumTimeInterval() throws Exception { // given int minimumAccessTimeInterval = 500; LinkSelector linkSelector = new TopkLinkSelector(); FrontierManager frontierManager = new FrontierManager(frontier, dataPath, downloadRobots, 2, 2, minimumAccessTimeInterval , linkSelector, emptyLinkFilter, metricsManager); LinkRelevance link1 = new LinkRelevance(new URL("http://www.example1.com/index1.html"), 1); LinkRelevance link2 = new LinkRelevance(new URL("http://www.example1.com/index2.html"), 2); frontierManager.insert(link1); frontierManager.insert(link2); // when LinkRelevance selectedLink1 = frontierManager.nextURL(); DataNotFoundException notFoundException1 = null; try { frontierManager.nextURL(); } catch(DataNotFoundException e) { notFoundException1 = e; } // should return after minimum time interval Thread.sleep(minimumAccessTimeInterval+10); LinkRelevance selectedLink2 = frontierManager.nextURL(); // then assertThat(selectedLink1, is(notNullValue())); assertThat(selectedLink1.getURL().toString(), is(link2.getURL().toString())); assertThat(notFoundException1, is(notNullValue())); assertThat(notFoundException1.ranOutOfLinks(), is(false)); assertThat(selectedLink2, is(notNullValue())); assertThat(selectedLink2, is(notNullValue())); frontierManager.close(); } }