package focusedCrawler.integration; import static java.util.Arrays.asList; import static org.hamcrest.CoreMatchers.is; import static org.hamcrest.CoreMatchers.nullValue; import static org.hamcrest.Matchers.lessThan; import static org.junit.Assert.assertThat; import java.io.IOException; import java.nio.file.Paths; import java.util.List; import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; import com.sun.net.httpserver.HttpServer; import focusedCrawler.Main; import focusedCrawler.config.ConfigService; import focusedCrawler.crawler.async.TestWebServerBuilder; import focusedCrawler.link.frontier.Frontier; import focusedCrawler.link.frontier.LinkRelevance; public class HardFocusCrawlingTest { @Rule public TemporaryFolder tempFolder = new TemporaryFolder(); static String basePath = HardFocusCrawlingTest.class.getResource("hard_focus_test").getFile(); private static HttpServer httpServer; @BeforeClass public static void setupServer() throws IOException, InterruptedException { System.out.println("HardFocusCrawlingTest"); httpServer = new TestWebServerBuilder("127.0.0.1", 1234) .withStaticFolder(Paths.get(basePath, "html")) .start(); } @AfterClass public static void shutdownServer() throws IOException { httpServer.stop(0); } @Test public void shouldDownloadLinksOnlyFromRelevantPages() throws Exception { String outputPath = tempFolder.newFolder().toString(); String configPath = basePath + "/config/"; String seedPath = basePath + "/seeds.txt"; String modelPath = basePath + "/model/"; // when String[] args = { "startCrawl", "-c", configPath, "-m", modelPath, "-o", outputPath, "-s", seedPath }; Main.main(args); // then ConfigService config = new ConfigService(configPath + "/ache.yml"); String linkDirectory = config.getLinkStorageConfig().getLinkDirectory(); String dir = Paths.get(outputPath, linkDirectory).toString(); Frontier frontier = new Frontier(dir, 1000); List<String> shouldBeDownloaded = asList( "index.html", "index_irrelevant.html", "index_relevant.html", "relevant_page1.html", "irrelevant_page1.html" ); List<String> shouldNOTBeDownloaded = asList( "relevant_page2.html", "irrelevant_page2.html" ); for (String url : shouldBeDownloaded) { LinkRelevance link = LinkRelevance.create("http://127.0.0.1:1234/" + url); assertThat(frontier.exist(link), is(lessThan(0))); } for (String url : shouldNOTBeDownloaded) { LinkRelevance link = LinkRelevance.create("http://127.0.0.1:1234/" + url); assertThat(frontier.exist(link), is(nullValue())); } } }