package io.monokkel;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import io.monokkel.configuration.CrawlerConfiguration;
import io.monokkel.configuration.CrawlerConfigurationBuilder;
import io.monokkel.configuration.CrawlerWireObject;
import io.monokkel.core.*;
import io.monokkel.domain.PageData;
import io.monokkel.domain.UrlVisitResponse;
import io.monokkel.exceptions.FatalFault;
import io.monokkel.exceptions.IndexDocumentException;
import io.monokkel.exceptions.ParseException;
import io.monokkel.exceptions.UrlVisitException;
import io.monokkel.factories.HttpClient4Builder;
import org.apache.commons.lang.mutable.MutableInt;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.junit.Before;
import org.junit.Test;
import org.mockito.Mockito;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Map;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import static java.util.concurrent.Executors.newScheduledThreadPool;
import static org.junit.Assert.assertEquals;
import static org.mockito.Mockito.*;
/**
 * Unit tests for the {@link Crawler} crawl loop. All collaborators (visitor,
 * transformer, indexer, cleaner, validator) are Mockito mocks wired together
 * through a {@link CrawlerWireObject}; each test schedules {@code killCrawler}
 * on a timer so the otherwise long-running crawl loop terminates.
 */
public class CrawlerTest {

    /** HTML payload served by the mocked visitor in every test. */
    private static final String SAMPLE_HTML =
            "<html><body><a href=\"http://localhost:49291?request=first\" />yes yes yes</body></html>";
    /** Fixed crawl timestamp shared by all tests. */
    private static final long TIME_STAMP = 1L;

    private UrlVisitor mockedUrlVisitor;
    private HtmlTransformer mockedHtmlTransformer;
    private Indexer mockedIndexer;
    private CrawlerWireObject crawlerWireObject;
    private Map<String, String> transformationMap;
    private DataCleaner mockedDataCleaner;
    private DataValidator mockedDataValidator;
    private JsonTransformer mockJsonTransformer;

    /** Creates fresh mocks and wires them into the {@link CrawlerWireObject} before each test. */
    @Before
    public void setup() {
        mockedUrlVisitor = mock(UrlVisitor.class);
        mockedHtmlTransformer = mock(HtmlTransformer.class);
        mockedIndexer = mock(Indexer.class);
        mockedDataCleaner = mock(DataCleaner.class);
        transformationMap = Maps.newHashMap();
        mockedDataValidator = mock(DataValidator.class);
        mockJsonTransformer = mock(JsonTransformer.class);
        crawlerWireObject = new CrawlerWireObject(mockedUrlVisitor, mockedHtmlTransformer, mockedIndexer,
                mockedDataCleaner, mockedDataValidator, mockJsonTransformer);
    }

    @Test
    public void executeCrawler_withMockedVisitor_expectErrorHandlingToReboot() throws FatalFault, UrlVisitException, ParseException {
        final String successUrl = "http://seedurl2";
        Crawler crawler = new Crawler();
        // Two seed urls: the first visit throws, the second succeeds — the crawler
        // must survive the exception and still parse the successful response.
        CrawlerConfiguration crawlerConfiguration = new CrawlerConfigurationBuilder()
                .setFilters(Lists.newArrayList())
                .setIndexName("").setIndexType("").setIndexNodeHost("").setIndexNodePort(0)
                .setMaxDuration(1L)
                .setSeedUrls(Lists.newArrayList("http://seedurl1", successUrl))
                .setContentRetrievalExpression("").setAttributeToLocateContent("")
                .setCreateEmbeddedElasticNode(false)
                .setMaxDepth(1)
                .setTransformationPath("").setCleanDataFilePath("")
                .setSeconds(1).setUrlsPer(1)
                .setDisableThrottle(true)
                .createCrawlerConfiguration();
        UrlVisitResponse urlVisitResponse = createUrlVisitResponse(successUrl, SAMPLE_HTML, TIME_STAMP);
        when(mockedHtmlTransformer.shouldParse(successUrl, SAMPLE_HTML, urlVisitResponse.getTypesFromTheResponseHeader())).thenReturn(true);
        // First call fails, every subsequent call returns the good response.
        when(mockedUrlVisitor.visitUrlAndRespond(Mockito.anyString()))
                .thenThrow(new UrlVisitException("", null))
                .thenReturn(urlVisitResponse);
        scheduleAfter(crawler::killCrawler, 2);
        crawler.executeCrawler(crawlerConfiguration, crawlerWireObject);
        verify(mockedHtmlTransformer, times(1)).parse(successUrl, SAMPLE_HTML, TIME_STAMP);
    }

    @Test
    public void executeCrawler_withMockedVisitor_expectVisitorToBeCalled() throws FatalFault, UrlVisitException, ParseException {
        String seedUrl = "http://seedurl";
        Crawler crawler = new Crawler();
        CrawlerConfiguration crawlerConfiguration = buildCrawlerConfiguration(seedUrl);
        stubHappyPath(seedUrl);
        scheduleAfter(crawler::killCrawler, 2);
        crawler.executeCrawler(crawlerConfiguration, crawlerWireObject);
        verify(mockedUrlVisitor, times(1)).visitUrlAndRespond(seedUrl);
    }

    @Test
    public void executeCrawler_withMockedVisitor_expectParserToBeCalled() throws FatalFault, UrlVisitException, ParseException {
        String seedUrl = "http://seedurl";
        Crawler crawler = new Crawler();
        CrawlerConfiguration crawlerConfiguration = buildCrawlerConfiguration(seedUrl);
        stubHappyPath(seedUrl);
        scheduleAfter(crawler::killCrawler, 2);
        crawler.executeCrawler(crawlerConfiguration, crawlerWireObject);
        verify(mockedHtmlTransformer, times(1)).parse(seedUrl, SAMPLE_HTML, TIME_STAMP);
    }

    @Test
    public void executeCrawler_withMockedVisitor_expectPageDataToBeCalled() throws FatalFault, UrlVisitException, ParseException, IndexDocumentException {
        String seedUrl = "http://seedurl";
        Crawler crawler = new Crawler();
        CrawlerConfiguration crawlerConfiguration = buildCrawlerConfiguration(seedUrl);
        PageData pageData = stubHappyPath(seedUrl);
        scheduleAfter(crawler::killCrawler, 2);
        crawler.executeCrawler(crawlerConfiguration, crawlerWireObject);
        verify(mockedDataCleaner, times(1)).clean(pageData);
    }

    @Test
    public void executeCrawler_withMockedVisitor_expectIndexerToBeCalled() throws FatalFault, UrlVisitException, ParseException, IndexDocumentException {
        String seedUrl = "http://seedurl";
        Crawler crawler = new Crawler();
        CrawlerConfiguration crawlerConfiguration = buildCrawlerConfiguration(seedUrl);
        PageData pageData = stubHappyPath(seedUrl);
        scheduleAfter(crawler::killCrawler, 2);
        crawler.executeCrawler(crawlerConfiguration, crawlerWireObject);
        verify(mockedIndexer, times(1)).indexParserOutput(pageData);
    }

    @Test
    public void executeCrawler_withSetupOfAnEmbeddedElasticSearchNode_expectNodeHttpToReturnStatusCode200After20seconds() throws FatalFault, UrlVisitException, ParseException, IndexDocumentException, InterruptedException, IOException {
        final int expectedStatusCode = 200;
        Crawler crawler = new Crawler();
        CrawlerConfiguration crawlerConfiguration = setupCrawler();
        // Async probe that queries the REST API of the embedded Elastic node.
        final MutableInt actualStatusCode = new MutableInt();
        Runnable askIndexIfIsAlive = () -> {
            HttpGet target = new HttpGet("http://localhost:49231/");
            // try-with-resources: the client must be closed or it leaks connections.
            try (CloseableHttpClient closeableHttpClient = new HttpClient4Builder().buildClientBuilder()) {
                Integer statusCode = closeableHttpClient.execute(target, response1 -> response1.getStatusLine().getStatusCode());
                actualStatusCode.setValue(statusCode);
            } catch (IOException e) {
                e.printStackTrace();
            }
        };
        scheduleAfter(crawler::killCrawler, 20);
        scheduleAfter(askIndexIfIsAlive, 10);
        crawler.executeCrawler(crawlerConfiguration, crawlerWireObject);
        assertEquals(expectedStatusCode, actualStatusCode.intValue());
    }

    /**
     * Runs {@code task} once after {@code delaySeconds} seconds. The single-thread
     * pool is shut down immediately after scheduling; already-queued delayed tasks
     * still execute (the default {@code ScheduledThreadPoolExecutor} policy), so the
     * worker thread no longer leaks past the end of the test.
     */
    private void scheduleAfter(Runnable task, long delaySeconds) {
        ScheduledExecutorService pool = newScheduledThreadPool(1);
        pool.schedule(task, delaySeconds, TimeUnit.SECONDS);
        pool.shutdown();
    }

    /**
     * Stubs visitor/transformer/cleaner mocks for a successful crawl of
     * {@code seedUrl} and returns the {@link PageData} the parse stub yields.
     */
    private PageData stubHappyPath(String seedUrl) throws UrlVisitException, ParseException {
        PageData pageData = new PageData(Sets.newHashSet(""), seedUrl, TIME_STAMP, SAMPLE_HTML, SAMPLE_HTML, "title", transformationMap);
        UrlVisitResponse urlVisitResponse = createUrlVisitResponse(seedUrl, SAMPLE_HTML, TIME_STAMP);
        setupMocks(seedUrl, SAMPLE_HTML, TIME_STAMP, mockedUrlVisitor, mockedHtmlTransformer, urlVisitResponse, pageData, mockedDataCleaner);
        return pageData;
    }

    /** Wires the given response/page data into the visit → shouldParse → parse → clean chain. */
    private void setupMocks(String seedUrl, String response, long timeStamp, UrlVisitor urlVisitor, HtmlTransformer htmlTransformer, UrlVisitResponse urlVisitResponse, PageData pageData, DataCleaner mockedDataCleaner) throws UrlVisitException, ParseException {
        when(urlVisitor.visitUrlAndRespond(seedUrl)).thenReturn(urlVisitResponse);
        when(htmlTransformer.parse(seedUrl, response, timeStamp)).thenReturn(pageData);
        when(htmlTransformer.shouldParse(seedUrl, response, urlVisitResponse.getTypesFromTheResponseHeader())).thenReturn(true);
        when(mockedDataCleaner.clean(pageData)).thenReturn(pageData);
    }

    /** Builds a configuration with an embedded Elastic node and a fully stubbed happy path. */
    private CrawlerConfiguration setupCrawler() throws UrlVisitException, ParseException {
        String seedUrl = "http://seedurl";
        CrawlerConfiguration crawlerConfiguration = buildCrawlerConfiguration(seedUrl, true);
        stubHappyPath(seedUrl);
        return crawlerConfiguration;
    }

    /** Builds a {@link UrlVisitResponse} with HTTP 200 and a {@code text/html} content type. */
    private UrlVisitResponse createUrlVisitResponse(String url, String response, long timestamp) {
        return new UrlVisitResponse(response, 200, ImmutableList.of("text/html"), timestamp, url);
    }

    /** Configuration for a single seed url without an embedded Elastic node. */
    private CrawlerConfiguration buildCrawlerConfiguration(final String seedUrl) {
        return buildCrawlerConfiguration(seedUrl, false);
    }

    /** Minimal single-seed configuration; {@code startElasticNode} toggles the embedded node. */
    private CrawlerConfiguration buildCrawlerConfiguration(final String seedUrl, final boolean startElasticNode) {
        final ArrayList<String> filters = Lists.newArrayList();
        return new CrawlerConfigurationBuilder()
                .setFilters(filters)
                .setIndexName("").setIndexType("").setIndexNodeHost("").setIndexNodePort(0)
                .setMaxDuration(1L)
                .setSeedUrls(Lists.newArrayList(seedUrl))
                .setContentRetrievalExpression("").setAttributeToLocateContent("")
                .setCreateEmbeddedElasticNode(startElasticNode)
                .setMaxDepth(1)
                .setTransformationPath("").setCleanDataFilePath("")
                .setSeconds(1).setUrlsPer(3)
                .setDisableThrottle(true)
                .createCrawlerConfiguration();
    }
}