package io.monokkel;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import io.monokkel.configuration.CrawlerConfiguration;
import io.monokkel.configuration.CrawlerConfigurationBuilder;
import io.monokkel.configuration.CrawlerWireObject;
import io.monokkel.core.*;
import io.monokkel.domain.PageData;
import io.monokkel.domain.UrlVisitResponse;
import io.monokkel.exceptions.FatalFault;
import io.monokkel.exceptions.IndexDocumentException;
import io.monokkel.exceptions.ParseException;
import io.monokkel.exceptions.UrlVisitException;
import io.monokkel.factories.HttpClient4Builder;
import org.apache.commons.lang.mutable.MutableInt;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.junit.Before;
import org.junit.Test;
import org.mockito.Mockito;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Map;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

import static java.util.concurrent.Executors.newScheduledThreadPool;
import static org.junit.Assert.assertEquals;
import static org.mockito.Mockito.*;

/**
 * Created by tarjei on 10/06/14.
*/ public class CrawlerTest { private UrlVisitor mockedUrlVisitor; private HtmlTransformer mockedHtmlTransformer; private Indexer mockedIndexer; private CrawlerWireObject crawlerWireObject; private Map<String, String> transformationMap; private DataCleaner mockedDataCleaner; private DataValidator mockedDataValidator; private JsonTransformer mockJsonTransformer; @Before public void setup() { mockedUrlVisitor = mock(UrlVisitor.class); mockedHtmlTransformer = mock(HtmlTransformer.class); mockedIndexer = mock(Indexer.class); mockedDataCleaner = mock(DataCleaner.class); transformationMap = Maps.newHashMap(); mockedDataValidator = mock(DataValidator.class); mockJsonTransformer = mock(JsonTransformer.class); crawlerWireObject = new CrawlerWireObject(mockedUrlVisitor, mockedHtmlTransformer, mockedIndexer, mockedDataCleaner, mockedDataValidator, mockJsonTransformer); } @Test public void executeCrawler_withMockedVisitor_expectErrorHandlingToReboot() throws FatalFault, UrlVisitException, ParseException { final String response = "<html><body><a href=\"http://localhost:49291?request=first\" />yes yes yes</body></html>"; final long timeStamp = 1L; final String successUrl = "http://seedurl2"; Crawler crawler = new Crawler(); String transformationPath = ""; CrawlerConfiguration crawlerConfiguration = new CrawlerConfigurationBuilder().setFilters(Lists.newArrayList()).setIndexName("").setIndexType("").setIndexNodeHost("").setIndexNodePort(0).setMaxDuration(1L).setSeedUrls(Lists.newArrayList("http://seedurl1", successUrl)).setContentRetrievalExpression("").setAttributeToLocateContent("").setCreateEmbeddedElasticNode(false).setMaxDepth(1).setTransformationPath(transformationPath).setCleanDataFilePath("").setSeconds(1).setUrlsPer(1).setDisableThrottle(true).createCrawlerConfiguration(); UrlVisitResponse urlVisitResponse = createUrlVisitResponse(successUrl,response,timeStamp); 
when(mockedHtmlTransformer.shouldParse(successUrl,response,urlVisitResponse.getTypesFromTheResponseHeader())).thenReturn(true); when(mockedUrlVisitor.visitUrlAndRespond(Mockito.anyString())).thenThrow(new UrlVisitException("", null)).thenReturn(urlVisitResponse); newScheduledThreadPool(1).schedule(crawler::killCrawler, 2, TimeUnit.SECONDS); crawler.executeCrawler(crawlerConfiguration, crawlerWireObject); verify(mockedHtmlTransformer, times(1)).parse(successUrl, response, timeStamp); } @Test public void executeCrawler_withMockedVisitor_expectVisitorToBeCalled() throws FatalFault, UrlVisitException, ParseException { String seedUrl = "http://seedurl"; final String response = "<html><body><a href=\"http://localhost:49291?request=first\" />yes yes yes</body></html>"; final long timeStamp = 1L; String nextUrl = ""; Crawler crawler = new Crawler(); CrawlerConfiguration crawlerConfiguration = buildCrawlerConfiguration(seedUrl); UrlVisitResponse urlVisitResponse = createUrlVisitResponse(seedUrl,response,timeStamp); PageData pageData = new PageData(Sets.newHashSet(nextUrl), seedUrl, timeStamp, response, response, "title", transformationMap); setupMocks(seedUrl, response, timeStamp, mockedUrlVisitor, mockedHtmlTransformer, urlVisitResponse, pageData, mockedDataCleaner); newScheduledThreadPool(1).schedule(crawler::killCrawler, 2, TimeUnit.SECONDS); crawler.executeCrawler(crawlerConfiguration, crawlerWireObject); verify(mockedUrlVisitor, times(1)).visitUrlAndRespond(seedUrl); } private void setupMocks(String seedUrl, String response, long timeStamp, UrlVisitor urlVisitor, HtmlTransformer htmlTransformer, UrlVisitResponse urlVisitResponse, PageData pageData, DataCleaner mockedDataCleaner) throws UrlVisitException, ParseException { when(urlVisitor.visitUrlAndRespond(seedUrl)).thenReturn(urlVisitResponse); when(htmlTransformer.parse(seedUrl, response, timeStamp)).thenReturn(pageData); 
when(htmlTransformer.shouldParse(seedUrl,response,urlVisitResponse.getTypesFromTheResponseHeader())).thenReturn(true); when(mockedDataCleaner.clean(pageData)).thenReturn(pageData); } @Test public void executeCrawler_withMockedVisitor_expectParserToBeCalled() throws FatalFault, UrlVisitException, ParseException { String seedUrl = "http://seedurl"; final String response = "<html><body><a href=\"http://localhost:49291?request=first\" />yes yes yes</body></html>"; final long timeStamp = 1L; String nextUrl = ""; Crawler crawler = new Crawler(); CrawlerConfiguration crawlerConfiguration = buildCrawlerConfiguration(seedUrl); UrlVisitResponse urlVisitResponse = createUrlVisitResponse(seedUrl,response,timeStamp); PageData pageData = new PageData(Sets.newHashSet(nextUrl), seedUrl, timeStamp, response, response, "title", transformationMap); setupMocks(seedUrl, response, timeStamp, mockedUrlVisitor, mockedHtmlTransformer, urlVisitResponse, pageData, mockedDataCleaner); newScheduledThreadPool(1).schedule(crawler::killCrawler, 2, TimeUnit.SECONDS); crawler.executeCrawler(crawlerConfiguration, crawlerWireObject); verify(mockedHtmlTransformer, times(1)).parse(seedUrl, response, timeStamp); } @Test public void executeCrawler_withMockedVisitor_expectPageDataToBeCalled() throws FatalFault, UrlVisitException, ParseException, IndexDocumentException { String seedUrl = "http://seedurl"; final String response = "<html><body><a href=\"http://localhost:49291?request=first\" />yes yes yes</body></html>"; final long timeStamp = 1L; String nextUrl = ""; Crawler crawler = new Crawler(); CrawlerConfiguration crawlerConfiguration = buildCrawlerConfiguration(seedUrl); PageData pageData = new PageData(Sets.newHashSet(nextUrl), seedUrl, timeStamp, response, response, "title", transformationMap); UrlVisitResponse urlVisitResponse = createUrlVisitResponse(seedUrl,response,timeStamp); setupMocks(seedUrl, response, timeStamp, mockedUrlVisitor, mockedHtmlTransformer, urlVisitResponse, pageData, 
mockedDataCleaner); newScheduledThreadPool(1).schedule(crawler::killCrawler, 2, TimeUnit.SECONDS); crawler.executeCrawler(crawlerConfiguration, crawlerWireObject); verify(mockedDataCleaner, times(1)).clean(pageData); } @Test public void executeCrawler_withMockedVisitor_expectIndexerToBeCalled() throws FatalFault, UrlVisitException, ParseException, IndexDocumentException { String seedUrl = "http://seedurl"; final String response = "<html><body><a href=\"http://localhost:49291?request=first\" />yes yes yes</body></html>"; final long timeStamp = 1L; String nextUrl = ""; Crawler crawler = new Crawler(); CrawlerConfiguration crawlerConfiguration = buildCrawlerConfiguration(seedUrl); PageData pageData = new PageData(Sets.newHashSet(nextUrl), seedUrl, timeStamp, response, response, "title", transformationMap); UrlVisitResponse urlVisitResponse = createUrlVisitResponse(seedUrl,response,timeStamp); setupMocks(seedUrl, response, timeStamp, mockedUrlVisitor, mockedHtmlTransformer, urlVisitResponse, pageData, mockedDataCleaner); newScheduledThreadPool(1).schedule(crawler::killCrawler, 2, TimeUnit.SECONDS); crawler.executeCrawler(crawlerConfiguration, crawlerWireObject); verify(mockedIndexer, times(1)).indexParserOutput(pageData); } @Test public void executeCrawler_withSetupOfAnEmbeddedElasticSearchNode_expectNodeHttpToReturnStatusCode200After20seconds() throws FatalFault, UrlVisitException, ParseException, IndexDocumentException, InterruptedException, IOException { final int expectedStatusCode = 200; Crawler crawler = new Crawler(); CrawlerConfiguration crawlerConfiguration = setupCrawler(); // The async task for query the REST API found at the ElasticNode final MutableInt actualStatusCode = new MutableInt(); Runnable askIndexIfIsAlive = () -> { HttpClient4Builder httpClient4Builder = new HttpClient4Builder(); final CloseableHttpClient closeableHttpClient = httpClient4Builder.buildClientBuilder(); HttpGet target = new HttpGet("http://localhost:49231/"); try { Integer 
statusCode = closeableHttpClient.execute(target, response1 -> response1.getStatusLine().getStatusCode()); actualStatusCode.setValue(statusCode); } catch (IOException e) { e.printStackTrace(); } }; newScheduledThreadPool(1).schedule(crawler::killCrawler, 20, TimeUnit.SECONDS); newScheduledThreadPool(1).schedule(askIndexIfIsAlive, 10, TimeUnit.SECONDS); crawler.executeCrawler(crawlerConfiguration, crawlerWireObject); assertEquals(expectedStatusCode, actualStatusCode.intValue()); } private CrawlerConfiguration setupCrawler() throws UrlVisitException, ParseException { String seedUrl = "http://seedurl"; CrawlerConfiguration crawlerConfiguration = buildCrawlerConfiguration(seedUrl, true); String nextUrl = ""; final long timeStamp = 1L; final String response = "<html><body><a href=\"http://localhost:49291?request=first\" />yes yes yes</body></html>"; PageData pageData = new PageData(Sets.newHashSet(nextUrl), seedUrl, timeStamp, response, response, "title", transformationMap); UrlVisitResponse urlVisitResponse = createUrlVisitResponse(seedUrl,response); setupMocks(seedUrl, response, timeStamp, mockedUrlVisitor, mockedHtmlTransformer, urlVisitResponse, pageData, mockedDataCleaner); return crawlerConfiguration; } private UrlVisitResponse createUrlVisitResponse(String url,String response,Long timestamp) { return new UrlVisitResponse(response, 200, ImmutableList.of("text/html"), timestamp, url); } private UrlVisitResponse createUrlVisitResponse(String url,String response) { return createUrlVisitResponse(url,response,1L); } private UrlVisitResponse createUrlVisitResponse(String url) { return createUrlVisitResponse(url,"<html><body></body></html>"); } private CrawlerConfiguration buildCrawlerConfiguration(final String seedUrl) { return buildCrawlerConfiguration(seedUrl, false); } private CrawlerConfiguration buildCrawlerConfiguration(final String seedUrl, final Boolean startElasticNode) { final String transformationPath = ""; final String cleanDataFilePath = ""; final Integer 
seconds = 1; final Integer urlsPer = 3; final ArrayList<String> filters = Lists.newArrayList(); return new CrawlerConfigurationBuilder().setFilters(filters).setIndexName("").setIndexType("").setIndexNodeHost("").setIndexNodePort(0).setMaxDuration(1L).setSeedUrls(Lists.newArrayList(seedUrl)).setContentRetrievalExpression("").setAttributeToLocateContent("").setCreateEmbeddedElasticNode(startElasticNode).setMaxDepth(1).setTransformationPath(transformationPath).setCleanDataFilePath(cleanDataFilePath).setSeconds(seconds).setUrlsPer(urlsPer).setDisableThrottle(true).createCrawlerConfiguration(); } }