package com.constellio.app.modules.es.connectors.http.utils; import static com.constellio.data.conf.HashingEncoding.BASE64; import static org.assertj.core.api.Assertions.assertThat; import org.eclipse.jetty.server.Server; import org.junit.After; import org.junit.Before; import org.junit.Test; import com.constellio.app.modules.es.connectors.http.ConnectorHttpDocumentFetchException; import com.constellio.app.modules.es.connectors.http.fetcher.HttpURLFetchingService; import com.constellio.app.modules.es.connectors.http.fetcher.UrlAcceptor; import com.constellio.app.modules.es.connectors.http.utils.HtmlPageParser.HtmlPageParserResults; import com.constellio.data.utils.hashing.HashingService; import com.constellio.model.services.parser.FileParser; import com.constellio.sdk.tests.ConstellioTest; import com.gargoylesoftware.htmlunit.html.HtmlPage; public class HtmlPageParserAcceptTest extends ConstellioTest { private static String WEBSITE = "http://localhost:4242/"; Server server; HttpURLFetchingService fetchingService = new HttpURLFetchingService(1000); UrlAcceptor acceptAllExceptElephant = new UrlAcceptor() { @Override public boolean isAccepted(String normalizedUrl) { return !normalizedUrl.contains("elephant"); } }; UrlAcceptor acceptAll = new UrlAcceptor() { @Override public boolean isAccepted(String normalizedUrl) { return true; } }; FileParser fileParser; HashingService hashingService; HtmlPageParser parser; @Before public void setUp() throws Exception { fileParser = getModelLayerFactory().newFileParser(); hashingService = getModelLayerFactory().getIOServicesFactory().newHashingService(BASE64); parser = new HtmlPageParser(acceptAll, fileParser, hashingService); server = WebsitesUtils.startWebsiteInState1(); } @After public void tearDown() throws Exception { if (server != null) { server.stop(); } } @Test public void whenParsingHtmlPageThenExtractUrlsTitleAndParsedContent() throws Exception { HtmlPageParserResults results = parse(WEBSITE + "singes.html"); assertThat(results.getTitle()).isEqualTo("La famille des singes"); assertThat(results.getParsedContent()).contains("espèces").contains("animaux").contains("éléphants"); assertThat(results.getLinkedUrls()).containsOnly( WEBSITE + "singes/macaque.html", WEBSITE + "singes/gorille.html", WEBSITE + "girafe.html", WEBSITE + "elephant.html" ); } @Test public void givenAnUrlIsNotAcceptedThenNotReturnedInUrlList() throws Exception { parser = new HtmlPageParser(acceptAllExceptElephant, fileParser, hashingService); HtmlPageParserResults results = parse(WEBSITE + "singes.html"); assertThat(results.getTitle()).isEqualTo("La famille des singes"); assertThat(results.getParsedContent()).contains("espèces").contains("animaux").contains("éléphants"); assertThat(results.getLinkedUrls()).containsOnly( WEBSITE + "singes/macaque.html", WEBSITE + "singes/gorille.html", WEBSITE + "girafe.html" ); } private HtmlPageParserResults parse(String url) throws ConnectorHttpDocumentFetchException { HtmlPage page = (HtmlPage) fetchingService.fetch(url); return parser.parse(url, page); } }