/*
* Tanaguru - Automated webpage assessment
* Copyright (C) 2008-2015 Tanaguru.org
*
* This file is part of Tanaguru.
*
* Tanaguru is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* Contact us by mail: tanaguru AT tanaguru DOT org
*/
package org.tanaguru.service;
import java.util.*;
import junit.framework.TestCase;
import org.apache.http.HttpStatus;
import org.tanaguru.crawler.CrawlerFactory;
import org.tanaguru.crawler.CrawlerFactoryImpl;
import org.tanaguru.crawler.util.*;
import org.tanaguru.entity.audit.Audit;
import org.tanaguru.entity.audit.AuditImpl;
import org.tanaguru.entity.audit.Content;
import org.tanaguru.entity.audit.SSP;
import org.tanaguru.entity.parameterization.*;
import org.tanaguru.entity.service.audit.AuditDataService;
import org.tanaguru.entity.service.audit.ContentDataService;
import org.tanaguru.entity.service.parameterization.ParameterDataService;
import org.tanaguru.entity.service.subject.WebResourceDataService;
import org.tanaguru.entity.subject.WebResource;
import org.tanaguru.service.mock.MockAuditDataService;
import org.tanaguru.service.mock.MockContentDataService;
import org.tanaguru.service.mock.MockParameterDataService;
import org.tanaguru.service.mock.MockWebResourceDataService;
/**
* This class is a functional test class of the crawler service.
*
* @author jkowalczyk
*/
public class CrawlerServiceImplTest extends TestCase {
private static final String FULL_SITE_CRAWL_URL_KEY = "full-site-crawl-url";
private static final String ROBOTS_RESTRICTED_CRAWL_URL_KEY =
"robots-restricted-crawl-url";
private static final String SITES_URL_BUNDLE_NAME = "sites-url";
private static final String FULL_SITE_CRAWL_CONF_FILE_PATH =
"src/test/resources/full-site-crawl-conf/";
private static final String PAGE_CRAWL_CONF_FILE_PATH =
"src/test/resources/page-crawl-conf/";
private static final String PAGE_NAME_LEVEL1 = "page-1.html";
private static final String PAGE_NAME_LEVEL2 = "page-2.html";
private static final String FORBIDDEN_PAGE_NAME = "page-access-forbidden-for-robots.html";
private final ResourceBundle bundle =
ResourceBundle.getBundle(SITES_URL_BUNDLE_NAME);
private CrawlerService crawlerService;
private CrawlerFactory crawlerFactory;
private WebResourceDataService mockWebResourceDataService;
private ContentDataService mockContentDataService;
private AuditDataService mockAuditDataService;
private ParameterDataService mockParameterDataService;
CrawlConfigurationUtils ccu = CrawlConfigurationUtils.getInstance();
public CrawlerServiceImplTest(String testName) {
super(testName);
}
@Override
protected void setUp() throws Exception {
super.setUp();
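// Use in-memory mock data services so the crawler service can run without a persistence layer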
mockWebResourceDataService = new MockWebResourceDataService();
mockContentDataService = new MockContentDataService();
mockAuditDataService = new MockAuditDataService();
mockParameterDataService = new MockParameterDataService();
crawlerFactory = new CrawlerFactoryImpl();
crawlerFactory.setOutputDir("/tmp");
((CrawlerFactoryImpl) crawlerFactory).setWebResourceDataService(mockWebResourceDataService);
((CrawlerFactoryImpl) crawlerFactory).setContentDataService(mockContentDataService);
crawlerService = new CrawlerServiceImpl();
crawlerService.setCrawlerFactory(crawlerFactory);
crawlerService.setWebResourceDataService(mockWebResourceDataService);
crawlerService.setAuditDataService(mockAuditDataService);
crawlerService.setContentDataService(mockContentDataService);
((CrawlerServiceImpl) crawlerService).setParameterDataService(mockParameterDataService);
initCrawlConfigUtils();
}
@Override
protected void tearDown() throws Exception {
super.tearDown();
}
/**
* Initialises an audit with the given crawl parameters, launches the
* crawl of the site and returns the contents fetched with an HTTP 200
* status.
*
* @param siteUrl the seed URL of the site to crawl
* @param depth the maximum crawl depth
* @param exclusionRegexp the URL exclusion regular expression
* @param inclusionRegexp the URL inclusion regular expression
* @param maxDuration the maximum duration of the crawl
* @param maxDocuments the maximum number of documents to fetch
* @return the list of contents retrieved by the crawl
*/
private List<Content> initialiseAndLaunchCrawl(
String siteUrl,
String depth,
String exclusionRegexp,
String inclusionRegexp,
String maxDuration,
String maxDocuments) {
Audit audit = new AuditImpl();
audit.setParameterSet(setCrawlParameters(depth, exclusionRegexp, inclusionRegexp, maxDuration, maxDocuments));
WebResource site = crawlerService.crawlSite(audit, siteUrl);
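// Collect the ids of the SSPs fetched with an HTTP 200 status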
Collection<Long> contentListId = mockContentDataService.getSSPIdsFromWebResource(site.getId(), HttpStatus.SC_OK, 0, 10);
List<Content> contentList = new ArrayList<>();
for (Long id : contentListId) {
Content content = mockContentDataService.readWithRelatedContent(id, false);
if (content != null) {
System.out.println(content.getURI() + " " + content.getClass());
contentList.add(content);
}
}
return contentList;
}
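/**
* Test that a crawl with a depth of 0 only retrieves the seed page.
*/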
public void testCrawl_SiteWithDepthLevel0Option() {
System.out.println("crawl_full_site_With_Depth_Level0_Option");
crawlerFactory.setCrawlConfigFilePath(FULL_SITE_CRAWL_CONF_FILE_PATH);
String siteUrl = bundle.getString(FULL_SITE_CRAWL_URL_KEY);
List<Content> contentList = initialiseAndLaunchCrawl(siteUrl, "0", "", "", "", "");
assertEquals(1, contentList.size());
Set<String> urlSet = getUrlSet(contentList);
assertTrue(urlSet.contains(siteUrl));
}
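/**
* Test that a crawl with a depth of 1 retrieves the seed page and the
* two pages it links to, including the page forbidden to robots (the
* test site exposes no robots.txt file).
*/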
public void testCrawl_SiteWithDepthLevel1Option() {
System.out.println("crawl_full_site_With_Depth_Level1_Option");
crawlerFactory.setCrawlConfigFilePath(FULL_SITE_CRAWL_CONF_FILE_PATH);
String siteUrl = bundle.getString(FULL_SITE_CRAWL_URL_KEY);
List<Content> contentList = initialiseAndLaunchCrawl(siteUrl, "1", "", "", "", "");
assertEquals(3, contentList.size());
Set<String> urlSet = getUrlSet(contentList);
assertTrue(urlSet.contains(siteUrl));
assertTrue(urlSet.contains(siteUrl + PAGE_NAME_LEVEL1));
assertTrue(urlSet.contains(siteUrl + FORBIDDEN_PAGE_NAME));
}
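/**
* Test that an exclusion regexp matching every .html page restricts the
* crawl to the seed page.
*/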
public void testCrawl_SiteWithRegexpExclusionOption() {
System.out.println("crawl_full_site_With_Regexp_Exclusion_Option");
crawlerFactory.setCrawlConfigFilePath(FULL_SITE_CRAWL_CONF_FILE_PATH);
String siteUrl = bundle.getString(FULL_SITE_CRAWL_URL_KEY);
List<Content> contentList = initialiseAndLaunchCrawl(siteUrl, "4", ".html", "", "", "");
assertEquals(1, contentList.size());
Set<String> urlSet = getUrlSet(contentList);
assertTrue(urlSet.contains(siteUrl));
}
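/**
* Test that an inclusion regexp restricts the crawl to the URLs that
* match it: only the pages whose name contains "page-" are fetched.
*/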
public void testCrawl_SiteWithRegexpInclusionOption() {
System.out.println("crawl_full_site_With_Regexp_Inclusion_Option");
crawlerFactory.setCrawlConfigFilePath(FULL_SITE_CRAWL_CONF_FILE_PATH);
String siteUrl = bundle.getString(FULL_SITE_CRAWL_URL_KEY) + PAGE_NAME_LEVEL1;
List<Content> contentList = initialiseAndLaunchCrawl(siteUrl, "2", "", "page-", "", "10");
assertEquals(3, contentList.size());
Set<String> urlSet = getUrlSet(contentList);
assertTrue(urlSet.contains(siteUrl));
assertTrue(urlSet.contains(bundle.getString(FULL_SITE_CRAWL_URL_KEY) + PAGE_NAME_LEVEL2));
assertTrue(urlSet.contains(bundle.getString(FULL_SITE_CRAWL_URL_KEY) + FORBIDDEN_PAGE_NAME));
}
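/**
* Test that the inclusion regexp "page-\d" keeps only the numbered
* pages: the page forbidden to robots, whose name does not match, is
* not fetched.
*/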
public void testCrawl_SiteWithRegexpInclusionOption2() {
System.out.println("crawl_full_site_With_Regexp_Inclusion_Option 2");
crawlerFactory.setCrawlConfigFilePath(FULL_SITE_CRAWL_CONF_FILE_PATH);
String siteUrl = bundle.getString(FULL_SITE_CRAWL_URL_KEY) + PAGE_NAME_LEVEL1;
List<Content> contentList = initialiseAndLaunchCrawl(siteUrl, "2", "", "page-\\d", "", "10");
assertEquals(2, contentList.size());
Set<String> urlSet = getUrlSet(contentList);
assertTrue(urlSet.contains(siteUrl));
assertTrue(urlSet.contains(bundle.getString(FULL_SITE_CRAWL_URL_KEY) + PAGE_NAME_LEVEL2));
}
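/**
* Test that, with the inclusion regexp "page-\d" and the site root as
* seed, the seed itself is fetched even though it does not match, along
* with the two numbered pages.
*/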
public void testCrawl_SiteWithRegexpInclusionOption3() {
System.out.println("crawl_full_site_With_Regexp_Inclusion_Option 3");
crawlerFactory.setCrawlConfigFilePath(FULL_SITE_CRAWL_CONF_FILE_PATH);
String siteUrl = bundle.getString(FULL_SITE_CRAWL_URL_KEY);
List<Content> contentList = initialiseAndLaunchCrawl(siteUrl, "2", "", "page-\\d", "", "10");
assertEquals(3, contentList.size());
Set<String> urlSet = getUrlSet(contentList);
assertTrue(urlSet.contains(siteUrl));
assertTrue(urlSet.contains(siteUrl + PAGE_NAME_LEVEL1));
assertTrue(urlSet.contains(siteUrl + PAGE_NAME_LEVEL2));
}
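/**
* Test that excluding the URLs that contain "robot" removes the page
* forbidden to robots from the crawl.
*/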
public void testCrawl_SiteWithRegexpExclusionOption2() {
System.out.println("crawl_full_site_With_Regexp_Exclusion_Option2");
crawlerFactory.setCrawlConfigFilePath(FULL_SITE_CRAWL_CONF_FILE_PATH);
String siteUrl = bundle.getString(FULL_SITE_CRAWL_URL_KEY);
List<Content> contentList = initialiseAndLaunchCrawl(siteUrl, "4", "robot", "", "", "");
assertEquals(3, contentList.size());
Set<String> urlSet = getUrlSet(contentList);
assertTrue(urlSet.contains(siteUrl));
assertTrue(urlSet.contains(siteUrl + PAGE_NAME_LEVEL1));
assertTrue(urlSet.contains(siteUrl + PAGE_NAME_LEVEL2));
}
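/**
* Test that several exclusion regexps can be passed as a
* semicolon-separated list.
*/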
public void testCrawl_SiteWithRegexpExclusionOption3() {
System.out.println("crawl_full_site_With_Regexp_Exclusion_Option3");
crawlerFactory.setCrawlConfigFilePath(FULL_SITE_CRAWL_CONF_FILE_PATH);
String siteUrl = bundle.getString(FULL_SITE_CRAWL_URL_KEY);
List<Content> contentList = initialiseAndLaunchCrawl(siteUrl, "4", "robot;page-2", "", "", "");
assertEquals(2, contentList.size());
Set<String> urlSet = getUrlSet(contentList);
assertTrue(urlSet.contains(siteUrl));
assertTrue(urlSet.contains(siteUrl + PAGE_NAME_LEVEL1));
}
/**
* Test the crawl of a site without a robots.txt file
*/
public void testCrawl_Site() {
System.out.println("crawl_full_site");
crawlerFactory.setCrawlConfigFilePath(FULL_SITE_CRAWL_CONF_FILE_PATH);
String siteUrl = bundle.getString(FULL_SITE_CRAWL_URL_KEY);
List<Content> contentList = initialiseAndLaunchCrawl(siteUrl, "3", "", "", "", "");
assertEquals(4, contentList.size());
Set<String> urlSet = getUrlSet(contentList);
assertTrue(urlSet.contains(siteUrl));
assertTrue(urlSet.contains(siteUrl + PAGE_NAME_LEVEL1));
assertTrue(urlSet.contains(siteUrl + PAGE_NAME_LEVEL2));
assertTrue(urlSet.contains(siteUrl + FORBIDDEN_PAGE_NAME));
}
/**
* Test the crawl of a page
*/
public void testCrawl_Page() {
System.out.println("crawl_page");
crawlerFactory.setCrawlConfigFilePath(PAGE_CRAWL_CONF_FILE_PATH);
String siteUrl = bundle.getString(FULL_SITE_CRAWL_URL_KEY);
Audit audit = new AuditImpl();
audit.setParameterSet(setCrawlParameters("3", "", "", "", ""));
WebResource page = crawlerService.crawlPage(audit, siteUrl);
Collection<Long> contentListId = mockContentDataService.getSSPIdsFromWebResource(page.getId(), HttpStatus.SC_OK, 0, 10);
List<Content> contentList = new ArrayList<>();
for (Long id : contentListId) {
contentList.add(mockContentDataService.readWithRelatedContent(id, false));
}
assertEquals(1, contentList.size());
Set<String> urlSet = getUrlSet(contentList);
assertTrue(urlSet.contains(siteUrl));
assertFalse(urlSet.contains(siteUrl + PAGE_NAME_LEVEL1));
assertFalse(urlSet.contains(siteUrl + PAGE_NAME_LEVEL2));
assertFalse(urlSet.contains(siteUrl + FORBIDDEN_PAGE_NAME));
}
/**
* Test the crawl of a site with robots.txt file
*/
public void testCrawl_Site_With_Robots() {
System.out.println("crawl_site_with_robots");
crawlerFactory.setCrawlConfigFilePath(FULL_SITE_CRAWL_CONF_FILE_PATH);
String siteUrl = bundle.getString(ROBOTS_RESTRICTED_CRAWL_URL_KEY);
List<Content> contentList = initialiseAndLaunchCrawl(siteUrl, "3", "", "", "", "");
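// 3 resources in total are expected, counting the SSPs plus the related contents of the first SSP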
assertEquals(3, contentList.size() + ((SSP) contentList.iterator().next()).getRelatedContentSet().size());
Set<String> urlSet = getUrlSet(contentList);
assertTrue(urlSet.contains(siteUrl));
assertTrue(urlSet.contains(siteUrl + PAGE_NAME_LEVEL1));
assertTrue(urlSet.contains(siteUrl + PAGE_NAME_LEVEL2));
assertFalse(urlSet.contains(siteUrl + FORBIDDEN_PAGE_NAME));
}
private Set<String> getUrlSet(List<Content> contentList) {
Set<String> urlSet = new HashSet<>();
for (Content content : contentList) {
urlSet.add(content.getURI());
}
return urlSet;
}
/**
* Builds the set of crawl Parameters from the given option values.
*
* @param depth the maximum crawl depth
* @param exclusionRegexp the URL exclusion regular expression
* @param inclusionRegexp the URL inclusion regular expression
* @param maxDuration the maximum duration of the crawl
* @param maxDocuments the maximum number of documents to fetch
* @return the set of Parameters regarding the options set as arguments
*/
private Set<Parameter> setCrawlParameters(
String depth,
String exclusionRegexp,
String inclusionRegexp,
String maxDuration,
String maxDocuments) {
Set<Parameter> crawlParameters = new HashSet<>();
ParameterFamily pf = new ParameterFamilyImpl();
pf.setParameterFamilyCode("CRAWLER");
//DEPTH
ParameterElement ped = new ParameterElementImpl();
ped.setParameterElementCode("DEPTH");
Parameter pedValue = new ParameterImpl();
pedValue.setParameterElement(ped);
pedValue.setValue(depth);
crawlParameters.add(pedValue);
//EXCLUSION_REGEX
ParameterElement peer = new ParameterElementImpl();
peer.setParameterElementCode("EXCLUSION_REGEX");
Parameter peerValue = new ParameterImpl();
peerValue.setParameterElement(peer);
peerValue.setValue(exclusionRegexp);
crawlParameters.add(peerValue);
//INCLUSION_REGEX
ParameterElement peir = new ParameterElementImpl();
peir.setParameterElementCode("INCLUSION_REGEX");
Parameter peirValue = new ParameterImpl();
peirValue.setParameterElement(peir);
peirValue.setValue(inclusionRegexp);
crawlParameters.add(peirValue);
//MAX_DURATION
ParameterElement pemdu = new ParameterElementImpl();
pemdu.setParameterElementCode("MAX_DURATION");
Parameter pemduValue = new ParameterImpl();
pemduValue.setParameterElement(pemdu);
pemduValue.setValue(maxDuration);
crawlParameters.add(pemduValue);
//MAX_DOCUMENTS
ParameterElement pemdo = new ParameterElementImpl();
pemdo.setParameterElementCode("MAX_DOCUMENTS");
Parameter pemdoValue = new ParameterImpl();
pemdoValue.setParameterElement(pemdo);
pemdoValue.setValue(maxDocuments);
crawlParameters.add(pemdoValue);
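// Reset the proxy-related system properties so the crawl runs without a proxy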
System.setProperty("proxyHost", "");
System.setProperty("proxyPort", "");
System.setProperty("proxyUser", "");
System.setProperty("proxyPassword", "");
return crawlParameters;
}
/**
* Set up the CrawlConfigUtils instance with modifiers
*/
private void initCrawlConfigUtils() {
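// The url modifier injects the seed url into the "longerOverrides" bean of the Heritrix configuration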
HeritrixParameterValueModifier urlModifier = new HeritrixParameterValueModifier();
urlModifier.setAttributeName("key");
urlModifier.setAttributeValue("seeds.textSource.value");
urlModifier.setIdBeanParent("longerOverrides");
urlModifier.setElementName("prop");
ccu.setUrlModifier(urlModifier);
Map<String, HeritrixConfigurationModifier> modifierMap = new HashMap<>();
List<HeritrixConfigurationModifier> proxyModifierList = new ArrayList<>();
HeritrixConfigurationModifier depthModifier = new HeritrixAttributeValueModifier();
depthModifier.setAttributeName("name");
depthModifier.setAttributeValue("maxHops");
depthModifier.setIdBeanParent("tooManyHopsDecideRule");
depthModifier.setElementName("property");
modifierMap.put("DEPTH", depthModifier);
HeritrixConfigurationModifier maxDocumentsModifier = new HeritrixAttributeValueModifier();
maxDocumentsModifier.setAttributeName("name");
maxDocumentsModifier.setAttributeValue("maxDocumentsDownload");
maxDocumentsModifier.setIdBeanParent("crawlLimiter");
maxDocumentsModifier.setElementName("property");
modifierMap.put("MAX_DOCUMENTS", maxDocumentsModifier);
HeritrixConfigurationModifier maxDurationModifier = new HeritrixAttributeValueModifier();
maxDurationModifier.setAttributeName("name");
maxDurationModifier.setAttributeValue("maxTimeSeconds");
maxDurationModifier.setIdBeanParent("crawlLimiter");
maxDurationModifier.setElementName("property");
modifierMap.put("MAX_DURATION", maxDurationModifier);
HeritrixConfigurationModifier proxyHostModifier = new HeritrixAttributeValueModifierAndEraserFromProperty();
proxyHostModifier.setAttributeName("name");
proxyHostModifier.setAttributeValue("httpProxyHost");
proxyHostModifier.setIdBeanParent("fetchHttp");
proxyHostModifier.setElementName("property");
((HeritrixAttributeValueModifierAndEraserFromProperty)proxyHostModifier).setPropertyValue(System.getProperty("proxyHost"));
proxyModifierList.add(proxyHostModifier);
HeritrixConfigurationModifier proxyPortModifier = new HeritrixAttributeValueModifierAndEraserFromProperty();
proxyPortModifier.setAttributeName("name");
proxyPortModifier.setAttributeValue("httpProxyPort");
proxyPortModifier.setIdBeanParent("fetchHttp");
proxyPortModifier.setElementName("property");
((HeritrixAttributeValueModifierAndEraserFromProperty)proxyPortModifier).setPropertyValue(System.getProperty("proxyPort"));
proxyModifierList.add(proxyPortModifier);
HeritrixConfigurationModifier proxyUserModifier = new HeritrixAttributeValueModifierAndEraserFromProperty();
proxyUserModifier.setAttributeName("name");
proxyUserModifier.setAttributeValue("httpProxyUser");
proxyUserModifier.setIdBeanParent("fetchHttp");
proxyUserModifier.setElementName("property");
((HeritrixAttributeValueModifierAndEraserFromProperty)proxyUserModifier).setPropertyValue(System.getProperty("proxyUser"));
proxyModifierList.add(proxyUserModifier);
HeritrixConfigurationModifier proxyPasswordModifier = new HeritrixAttributeValueModifierAndEraserFromProperty();
proxyPasswordModifier.setAttributeName("name");
proxyPasswordModifier.setAttributeValue("httpProxyPassword");
proxyPasswordModifier.setIdBeanParent("fetchHttp");
proxyPasswordModifier.setElementName("property");
((HeritrixAttributeValueModifierAndEraserFromProperty)proxyPasswordModifier).setPropertyValue(System.getProperty("proxyPassword"));
proxyModifierList.add(proxyPasswordModifier);
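// The exclusion and inclusion regexps are inserted into the regexList of their respective DecideRule beans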
HeritrixConfigurationModifier exclusionRegexpModifier = new HeritrixContainsRegexpParameterValueModifier();
exclusionRegexpModifier.setXpathExpression("//list[ancestor::property/@name='regexList' and ancestor::bean/@id='matchesListRegexDecideRule']");
exclusionRegexpModifier.setElementName("value");
modifierMap.put("EXCLUSION_REGEX", exclusionRegexpModifier);
HeritrixConfigurationModifier inclusionRegexpModifier = new HeritrixContainsRegexpParameterValueModifier();
inclusionRegexpModifier.setXpathExpression("//list[ancestor::property/@name='regexList' and ancestor::bean/@id='inclusionListRegexDecideRule']");
inclusionRegexpModifier.setElementName("value");
modifierMap.put("INCLUSION_REGEX", inclusionRegexpModifier);
ccu.setParameterModifierMap(modifierMap);
ccu.setProxyModifierList(proxyModifierList);
}
// /**
// * Test of crawl method, of class CrawlerServiceImpl.
// */
// public void testCrawl_Site_With_SiteMap() {
// System.out.println("crawl_site_with_sitemap");
// crawler.setSiteURL(null);
// CrawlerServiceImpl instance = new CrawlerServiceImpl();
// Site expResult = null;
// Site result = instance.crawl(site);
// }
}