package focusedCrawler.crawler.async;

import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.Matchers.notNullValue;
import static org.junit.Assert.assertThat;

import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

import org.apache.tika.metadata.Metadata;
import org.junit.Test;

import focusedCrawler.crawler.crawlercommons.fetcher.FetchedResult;
import focusedCrawler.link.frontier.LinkRelevance;
import focusedCrawler.util.CommunicationException;
import focusedCrawler.util.storage.StorageDefault;
import focusedCrawler.util.storage.StorageException;

/**
 * Unit test for {@link RobotsTxtHandler}: feeds it a robots.txt response and inspects
 * what it hands over to the link storage.
 */
public class RobotsTxtHandlerTest {

    /**
     * Storage stand-in that keeps the most recent {@link RobotsTxtHandler.RobotsData}
     * passed to {@link #insert(Object)}; every other inserted object is ignored.
     */
    static class LinkStorageMock extends StorageDefault {

        /** Last robots data received through {@code insert}, or {@code null} if none yet. */
        public RobotsTxtHandler.RobotsData robotsData = null;

        @Override
        public synchronized Object insert(Object obj)
                throws StorageException, CommunicationException {
            if (!(obj instanceof RobotsTxtHandler.RobotsData)) {
                return null;
            }
            this.robotsData = (RobotsTxtHandler.RobotsData) obj;
            return null;
        }
    }

    /**
     * Reads a test fixture that lives next to {@link RobotsTxtHandler} on the classpath.
     *
     * @param resourceName resource name, resolved relative to {@code RobotsTxtHandler}
     * @return the raw bytes of the resource
     * @throws Exception if the resource cannot be located or read
     */
    private static byte[] readFixture(String resourceName) throws Exception {
        Path fixturePath = Paths.get(RobotsTxtHandler.class.getResource(resourceName).toURI());
        return Files.readAllBytes(fixturePath);
    }

    /**
     * Completing a fetch of the sample robots.txt must deliver a {@code RobotsData} to the
     * storage whose {@code sitemapUrls} holds exactly the two expected sitemap URLs, in order.
     */
    @Test
    public void shouldParseLinksFromSitemapXml() throws Exception {
        // given
        LinkStorageMock storage = new LinkStorageMock();
        RobotsTxtHandler robotsHandler = new RobotsTxtHandler(storage, "TestAgent");

        String robotsUrl = "http://www.example.com/robots.txt";
        byte[] robotsBytes = readFixture("sample-robots.txt");

        FetchedResult fetched = new FetchedResult(
                robotsUrl,
                robotsUrl,
                1,
                new Metadata(),
                robotsBytes,
                "text/plain",
                1,
                null,
                robotsUrl,
                0,
                "127.0.0.1",
                200,
                "OK");
        LinkRelevance robotsLink =
                new LinkRelevance(new URL(robotsUrl), 1, LinkRelevance.Type.ROBOTS);

        // when
        robotsHandler.completed(robotsLink, fetched);

        // then
        assertThat(storage.robotsData, is(notNullValue()));
        assertThat(storage.robotsData.sitemapUrls.size(), is(2));
        assertThat(
                storage.robotsData.sitemapUrls.get(0),
                is("http://www.example.com/example-sitemap/sitemap.xml"));
        assertThat(
                storage.robotsData.sitemapUrls.get(1),
                is("http://www.example.com/example-sitemap/sitemap-news.xml"));
    }
}