package io.monokkel.actors; import akka.actor.Props; import com.google.common.collect.ImmutableList; import com.google.common.collect.Lists; import com.google.common.collect.Maps; import com.google.common.collect.Sets; import io.monokkel.actors.domain.ThrottlerConfig; import io.monokkel.core.*; import io.monokkel.domain.PageData; import io.monokkel.domain.UrlVisitResponse; import io.monokkel.exceptions.IndexDocumentException; import io.monokkel.exceptions.ValidationError; import io.monokkel.messages.ParserDone; import io.monokkel.messages.SeedUrl; import org.junit.Before; import org.junit.Test; import java.util.Map; import static java.lang.Thread.sleep; import static org.mockito.Mockito.*; public class CrawlerActorTest extends ActorTest { private UrlVisitor mockedUrlVisitor; private HtmlTransformer mockedHtmlTransformer; private Indexer mockedIndexer; private DataCleaner mockedDataCleaner; private ThrottlerConfig throttlerConfig; private DataValidator mockedValidator; private JsonTransformer mockedJsonTransformer; @Before public void setUp() throws Exception { mockedUrlVisitor = mock(UrlVisitor.class); mockedHtmlTransformer = mock(HtmlTransformer.class); mockedIndexer = mock(Indexer.class); mockedDataCleaner = mock(DataCleaner.class); mockedValidator = mock(DataValidator.class); mockedJsonTransformer = mock(JsonTransformer.class); throttlerConfig = new ThrottlerConfig(1,2, true); super.before(); } @Test public void onReceive_withOneSeedUrl_expectUrlToBeVisited() throws Exception { String url = "http://url"; when(mockedUrlVisitor.visitUrlAndRespond(url)).thenReturn(createUrlVisitResponse(url)); subject.tell(new SeedUrl(1, url), javaTestKit.getRef()); sleep(1000L); verify(mockedUrlVisitor, times(1)).visitUrlAndRespond(url); } @Test public void onReceive_withBasicResponse_expectParserToBeCalled() throws Exception { final String url = "http://url"; final String response = "<html></html>"; final UrlVisitResponse urlVisitResponse = createUrlVisitResponse(url, response); when(mockedUrlVisitor.visitUrlAndRespond(url)).thenReturn(urlVisitResponse); when(mockedHtmlTransformer.shouldParse(url, response, urlVisitResponse.getTypesFromTheResponseHeader())).thenReturn(true); subject.tell(new SeedUrl(1, url), javaTestKit.getRef()); sleep(1000L); verify(mockedHtmlTransformer, times(1)).parse(url, response, 1L); } @Test public void onReceive_withJSONBasicResponse_expectJSONParserToBeCalled() throws Exception { final String url = "http://url"; final String response = "{\"data\":\"dataContent\"}"; final UrlVisitResponse urlVisitResponse = createUrlVisitResponse(url, response); when(mockedUrlVisitor.visitUrlAndRespond(url)).thenReturn(urlVisitResponse); when(mockedJsonTransformer.shouldParse(url, response, urlVisitResponse.getTypesFromTheResponseHeader())).thenReturn(true); subject.tell(new SeedUrl(1, url), javaTestKit.getRef()); sleep(1000L); verify(mockedJsonTransformer, times(1)).parse(url, response, 1L); } @Test public void onReceive_withJSONBasicResponseAndNotParsables_expectJSONToNotBeCalled() throws Exception { final String url = "http://url"; final String response = "{\"data\":\"dataContent\"}"; final UrlVisitResponse urlVisitResponse = createUrlVisitResponse(url, response); when(mockedUrlVisitor.visitUrlAndRespond(url)).thenReturn(urlVisitResponse); when(mockedJsonTransformer.shouldParse(url, response, urlVisitResponse.getTypesFromTheResponseHeader())).thenReturn(false); subject.tell(new SeedUrl(1, url), javaTestKit.getRef()); sleep(1000L); verify(mockedJsonTransformer, never()).parse(url, response, 1L); } @Test public void onReceive_withWithSeedUrl_expectUrlToBeVisited() throws Exception { final String url = "http://url"; final String response = "<html></html>"; final UrlVisitResponse urlVisitResponse = createUrlVisitResponse(url, response); when(mockedUrlVisitor.visitUrlAndRespond(url)).thenReturn(urlVisitResponse); Map<String, String> transformedMap = Maps.newHashMap(); PageData pageData = new PageData(Sets.newHashSet(""), url, 1L, response, response, "title", transformedMap); when(mockedHtmlTransformer.shouldParse(url,response,urlVisitResponse.getTypesFromTheResponseHeader())).thenReturn(true); when(mockedHtmlTransformer.parse(url, response, 1L)).thenReturn(pageData); when(mockedDataCleaner.clean(pageData)).thenReturn(pageData); subject.tell(new SeedUrl(1, url), javaTestKit.getRef()); sleep(1000L); verify(mockedIndexer, times(1)).indexParserOutput(pageData); } @Test public void onReceive_withWithSeedUrlButFailedValidation_expectIndexerNeverToBeCalled() throws Exception { final String url = "http://url"; final String response = "<html></html>"; when(mockedUrlVisitor.visitUrlAndRespond(url)).thenReturn(createUrlVisitResponse(url,response)); Map<String, String> transformedMap = Maps.newHashMap(); PageData pageData = new PageData(Sets.newHashSet(""), url, 1L, response, response, "title", transformedMap); when(mockedHtmlTransformer.parse(url, response, 1L)).thenReturn(pageData); when(mockedDataCleaner.clean(pageData)).thenReturn(pageData); when(mockedValidator.validate(pageData)).thenThrow(new ValidationError("field", "broke", "expression")); subject.tell(new SeedUrl(1, url), javaTestKit.getRef()); sleep(1000L); verify(mockedIndexer, never()).indexParserOutput(pageData); } @Test public void onReceive_withWithSeedUrl_expectParseDoneToBeSent() throws Exception, IndexDocumentException { final String urlRetrievedFromHtml = "http://htmlurl"; PageData pageData = new PageData(Sets.newHashSet(urlRetrievedFromHtml), "http://seedurl5", 1L, "<html></html>", "title", "extractedContent",null); subject.tell(new ParserDone(pageData, 0, 1), javaTestKit.getRef()); sleep(1000L); verify(mockedUrlVisitor, times(1)).visitUrlAndRespond(urlRetrievedFromHtml); } private UrlVisitResponse createUrlVisitResponse(String url,String response) { return new UrlVisitResponse(response, 200, ImmutableList.of("text/html"), 1L, url); } private UrlVisitResponse createUrlVisitResponse(String url) { return createUrlVisitResponse(url,"<html><body></body></html>"); } @Override protected String getActorName() { return "crawlerActor"; } @Override protected Props getActor() { return CrawlerActor.props(mockedUrlVisitor, mockedIndexer, mockedDataCleaner, throttlerConfig, Lists.newArrayList(mockedHtmlTransformer, mockedJsonTransformer), mockedValidator); } }