/*
* Tanaguru - Automated webpage assessment
* Copyright (C) 2008-2015 Tanaguru.org
*
* This file is part of Tanaguru.
*
* Tanaguru is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* Contact us by mail: tanaguru AT tanaguru DOT org
*/
package org.tanaguru.service;

import java.util.Calendar;
import java.util.Collection;
import java.util.Date;
import java.util.List;
import java.util.Set;

import org.apache.log4j.Logger;
import org.springframework.beans.factory.annotation.Autowired;

import org.tanaguru.crawler.Crawler;
import org.tanaguru.crawler.CrawlerFactory;
import org.tanaguru.entity.audit.Audit;
import org.tanaguru.entity.audit.Content;
import org.tanaguru.entity.audit.RelatedContent;
import org.tanaguru.entity.audit.SSP;
import org.tanaguru.entity.parameterization.Parameter;
import org.tanaguru.entity.service.audit.AuditDataService;
import org.tanaguru.entity.service.audit.ContentDataService;
import org.tanaguru.entity.service.parameterization.ParameterDataService;
import org.tanaguru.entity.service.subject.WebResourceDataService;
import org.tanaguru.entity.subject.WebResource;
/**
* Implementation of the crawler service.
*
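 * <p>
 * A minimal usage sketch, assuming the service and its collaborators are
 * wired by Spring (the bean lookup and the {@code audit} instance are
 * illustrative, not part of this class):
 * <pre>{@code
 * CrawlerService crawlerService =
 *         applicationContext.getBean(CrawlerService.class);
 * // audit: a persisted Audit whose parameter set configures the crawl
 * WebResource page = crawlerService.crawlPage(audit, "http://site.org/");
 * }</pre>
 *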
* @author jkowalczyk
*/
public class CrawlerServiceImpl implements CrawlerService {
private static final Logger LOGGER = Logger.getLogger(CrawlerServiceImpl.class);
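    /**
     * Size of the batches used when linking fetched contents to the audit
     * and when deleting surplus contents.
     */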
private static final int PROCESS_WINDOW = 20;
/**
* The auditDataService instance
*/
private AuditDataService auditDataService;
@Override
@Autowired
public void setAuditDataService(AuditDataService auditDataService) {
this.auditDataService = auditDataService;
}
    /**
     * The parameterDataService instance
     */
private ParameterDataService parameterDataService;
@Autowired
public void setParameterDataService(ParameterDataService parameterDataService) {
this.parameterDataService = parameterDataService;
}
/**
* The contentDataService instance
*/
private ContentDataService contentDataService;
public ContentDataService getContentDataService() {
return contentDataService;
}
@Override
@Autowired
public void setContentDataService(ContentDataService contentDataService) {
this.contentDataService = contentDataService;
}
/**
* The webResourceDataService instance
*/
private WebResourceDataService webResourceDataService;
@Override
public WebResourceDataService getWebResourceDataService() {
return webResourceDataService;
}
@Override
@Autowired
public void setWebResourceDataService(WebResourceDataService webResourceDataService) {
this.webResourceDataService = webResourceDataService;
}
/**
* The crawler factory instance
*/
private CrawlerFactory crawlerFactory;
@Override
@Autowired
public void setCrawlerFactory(CrawlerFactory crawlerFactory) {
this.crawlerFactory = crawlerFactory;
}
/**
* Default constructor
*/
public CrawlerServiceImpl() {
super();
}
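    /**
     * Calls the crawler component on a single page, then links the audit
     * to the fetched contents.
     *
     * @param audit the current audit
     * @param pageUrl the page to crawl
     * @return the crawled page after modification
     */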
@Override
public WebResource crawlPage(Audit audit, String pageUrl) {
Crawler crawler = getCrawlerInstance((Set<Parameter>) audit.getParameterSet(), true);
crawler.setPageURL(pageUrl);
return crawl(crawler, audit, true);
}
    /**
     * Calls the crawler component on a whole site, then links the audit
     * to the fetched contents.
     *
     * @param audit the current audit
     * @param siteUrl the site to crawl
     * @return the crawled site after modification
     */
@Override
public WebResource crawlSite(Audit audit, String siteUrl) {
Crawler crawler = getCrawlerInstance((Set<Parameter>) audit.getParameterSet(), true);
crawler.setSiteURL(siteUrl);
return crawl(crawler, audit, true);
}
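    /**
     * Calls the crawler component on a group of pages, then links the audit
     * to the fetched contents.
     *
     * @param audit the current audit
     * @param siteUrl the site the pages belong to
     * @param urlList the URLs of the pages to crawl
     * @return the crawled group of pages after modification
     */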
@Override
public WebResource crawlGroupOfPages(Audit audit, String siteUrl, List<String> urlList) {
Crawler crawler = getCrawlerInstance((Set<Parameter>) audit.getParameterSet(), true);
crawler.setSiteURL(siteUrl, urlList);
return crawl(crawler, audit, true);
}
    /**
     * Runs the given crawler and attaches the resulting web resource to
     * the audit.
     *
     * @param crawler the crawler instance to run
     * @param audit the current audit
     * @param persistOnTheFly when true, the audit is persisted, the fetched
     * contents are linked to it and the surplus pages are removed
     * @return the crawled web resource
     */
private WebResource crawl(Crawler crawler, Audit audit, boolean persistOnTheFly) {
crawler.run();
WebResource wr = crawler.getResult();
wr.setAudit(audit);
audit.setSubject(wr);
if (persistOnTheFly) {
            // The relation from web resource to audit is refreshed, so the
            // audit has to be persisted first.
auditDataService.saveOrUpdate(audit);
setAuditToContent(wr, audit);
removeSummerRomance(audit);
removePageExcedent(wr, audit);
}
return wr;
}
    /**
     * Creates the relation between the fetched contents and the current
     * audit, PROCESS_WINDOW contents at a time.
     *
     * @param wr the crawled web resource
     * @param audit the current audit
     */
private void setAuditToContent(WebResource wr, Audit audit) {
// httpStatusCode = -1 means all.
int httpStatusCode = -1;
Long nbOfContent = contentDataService.getNumberOfSSPFromWebResource(wr, httpStatusCode);
Long i = Long.valueOf(0);
Date endProcessDate = null;
Date beginProcessDate = null;
Date endPersistDate;
LOGGER.debug("Number Of SSP From WebResource " + wr.getURL() + " : " + nbOfContent);
while (i.compareTo(nbOfContent) < 0) {
if (LOGGER.isDebugEnabled()) {
beginProcessDate = Calendar.getInstance().getTime();
LOGGER.debug("Set audit to ssp from "
+ i + " to " + (i + PROCESS_WINDOW));
}
Collection<Long> contentIdList
= contentDataService.getSSPIdsFromWebResource(
wr.getId(),
httpStatusCode,
i.intValue(),
PROCESS_WINDOW);
if (LOGGER.isDebugEnabled()) {
endProcessDate = Calendar.getInstance().getTime();
LOGGER.debug("Retrieving " + PROCESS_WINDOW + " SSP took "
+ (endProcessDate.getTime() - beginProcessDate.getTime())
+ " ms");
}
for (Long id : contentIdList) {
contentDataService.saveAuditToContent(id, audit.getId());
}
if (LOGGER.isDebugEnabled()) {
endPersistDate = Calendar.getInstance().getTime();
LOGGER.debug("Persisting " + PROCESS_WINDOW + " SSP took "
+ (endPersistDate.getTime() - endProcessDate.getTime())
+ " ms");
}
i = i + PROCESS_WINDOW;
}
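        // Same batched processing for the related contents (images,
        // stylesheets, ...).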
nbOfContent = contentDataService.getNumberOfRelatedContentFromWebResource(wr);
LOGGER.debug("Number Of Related Content From WebResource?" + wr.getURL() + " : " + nbOfContent);
i = Long.valueOf(0);
while (i.compareTo(nbOfContent) < 0) {
if (LOGGER.isDebugEnabled()) {
beginProcessDate = Calendar.getInstance().getTime();
LOGGER.debug("Set audit to relatedContent from "
+ i + " to " + (i + PROCESS_WINDOW));
}
Collection<Long> contentIdList
= contentDataService.getRelatedContentIdsFromWebResource(
wr.getId(),
i.intValue(),
PROCESS_WINDOW);
if (LOGGER.isDebugEnabled()) {
endProcessDate = Calendar.getInstance().getTime();
LOGGER.debug("Retrieving " + PROCESS_WINDOW + " relatedContent took "
+ (endProcessDate.getTime() - beginProcessDate.getTime())
+ " ms");
}
for (Long id : contentIdList) {
contentDataService.saveAuditToContent(id, audit.getId());
}
if (LOGGER.isDebugEnabled()) {
endPersistDate = Calendar.getInstance().getTime();
LOGGER.debug("Persisting " + PROCESS_WINDOW + " relatedContent took "
+ (endPersistDate.getTime() - endProcessDate.getTime())
+ " ms");
}
i = i + PROCESS_WINDOW;
}
webResourceDataService.saveOrUpdate(wr);
}
    /**
     * During the crawl, more web resources and contents than expected may
     * be retrieved. This method deletes the surplus, i.e. every content
     * beyond the MAX_DOCUMENTS audit parameter.
     *
     * @param wr the crawled web resource
     * @param audit the current audit
     */
    private void removePageExcedent(WebResource wr, Audit audit) {
int maxNumberOfCrawlPage = getMaxNumberOfCrawlPageFromAuditParameter(audit);
// httpStatusCode = -1 means all.
int httpStatusCode = -1;
Long nbOfContent = contentDataService.getNumberOfSSPFromWebResource(wr, httpStatusCode);
if (maxNumberOfCrawlPage == -1 || nbOfContent < maxNumberOfCrawlPage) {
return;
}
        Long i = (long) maxNumberOfCrawlPage + 1;
        Long fromValue = (long) maxNumberOfCrawlPage;
        LOGGER.info("Deleting " + (nbOfContent - maxNumberOfCrawlPage)
                + " surplus contents regarding the user-defined limit");
while (i.compareTo(nbOfContent) < 0) {
Collection<Long> contentIdList
= contentDataService.getSSPIdsFromWebResource(
wr.getId(),
httpStatusCode,
fromValue.intValue(),
PROCESS_WINDOW);
LOGGER.info("Delete excedent content from " +i + " to "+(i+PROCESS_WINDOW));
for (Long contentId : contentIdList){
Content content = contentDataService.read(contentId);
if (content instanceof SSP) {
webResourceDataService.delete(((SSP)content).getPage().getId());
}
contentDataService.delete(contentId);
}
i = i + PROCESS_WINDOW;
}
}
    /**
     * During the crawl, web resources and contents are created. Contents are
     * of two types: SSP or related content. An SSP is linked to a web
     * resource, and a related content is linked to an SSP. The relation
     * between an SSP and a related content is not known at fetch time, so
     * every related content is temporarily linked to an arbitrary SSP in
     * order to be linkable to the current audit. At the end of the crawl,
     * once the relation between each content and the audit has been
     * created, this fake relation can be cleaned up.
     *
     * This method was supposed to be called removeFakeRelation but, thanks
     * to the Scottish guy, it is now called removeSummerRomance.
     *
     * @param audit the current audit
     */
    private void removeSummerRomance(Audit audit) {
Set<RelatedContent> relatedContentSet =
(Set<RelatedContent>)contentDataService.getRelatedContentFromAudit(audit);
        for (RelatedContent relatedContent : relatedContentSet) {
            if (LOGGER.isDebugEnabled()) {
                LOGGER.debug("deleteContentRelationShip for " + ((Content) relatedContent).getURI());
            }
            // Clean up the fake SSP/relatedContent relation for every
            // related content, whatever the logging level.
            contentDataService.deleteContentRelationShip(((Content) relatedContent).getId());
        }
}
    /**
     * @param paramSet the audit parameters driving the crawler configuration
     * @param persistOnTheFly whether fetched contents are persisted on the fly
     * @return a crawler instance
     */
private Crawler getCrawlerInstance(Set<Parameter> paramSet, boolean persistOnTheFly) {
return crawlerFactory.create(paramSet, persistOnTheFly);
}
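    /**
     * @param audit the current audit
     * @return the value of the MAX_DOCUMENTS parameter, or -1 (meaning no
     * limit) when the parameter is absent or not a valid integer
     */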
    private int getMaxNumberOfCrawlPageFromAuditParameter(Audit audit) {
        // Fetch the parameter once instead of querying it twice.
        Parameter maxDocumentsParam = parameterDataService.getParameter(audit, "MAX_DOCUMENTS");
        if (maxDocumentsParam != null) {
            try {
                return Integer.valueOf(maxDocumentsParam.getValue());
            } catch (NumberFormatException nfe) {
                return -1;
            }
        }
        return -1;
    }
}