/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package org.opens.slurpmanager.crawler;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.TimeZone;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.lang.StringEscapeUtils;
import org.archive.crawler.framework.CrawlJob;
import org.archive.net.UURIFactory;
import org.opens.slurpmanager.exception.WrongURIException;
import org.opens.slurpmanager.handler.WebarchiveHandlerImpl;
import org.opens.slurpmanager.scope.CrawlScope;
/**
*
* @author jkowalczyk
*/
public class CrawlerImpl implements Crawler {
private static final String UNREACHABLE_URI_MSG = "The url is unreachable";
private CrawlScope scope;
private String url;
private File currentJobOutputDir;
private static final String urlStrToReplace = "# URLS HERE";
protected CrawlJob crawlJob;
protected String resultFilePath;
private String dateFormat = "yyyyMMddHHmmss";
private SimpleDateFormat sdf = new SimpleDateFormat(dateFormat);
private static final Logger logger = Logger.getLogger(CrawlerImpl.class.getName());
private static final int ONE_SECOND = 1000;
/**
*
*/
private String crawlConfigFilePath = "/etc/slurp-manager/context/crawler/";
public String getCrawlConfigFilePath() {
return crawlConfigFilePath;
}
public void setCrawlConfigFilePath(String crawlConfigFilePath) {
this.crawlConfigFilePath = crawlConfigFilePath;
}
/**
*
*/
protected String heritrixFileName = "slurp-crawler-beans.cxml";
public String getHeritrixFileName() {
return heritrixFileName;
}
public void setHeritrixFileName(String heritrixFileName) {
this.heritrixFileName = heritrixFileName;
}
/**
*
*/
protected String warcDir = "latest/warcs";
public String getWarcDir() {
return warcDir;
}
public void setWarcDir(String warcDir) {
this.warcDir = warcDir;
}
/**
*
*/
protected String warcExtension = ".warc";
public String getWarcExtension() {
return warcExtension;
}
public void setWarcExtension(String warcExtension) {
this.warcExtension = warcExtension;
}
/**
*
*/
private String temporaryDir = "/var/tmp/slurp-manager";
public String getTemporaryDir() {
return this.temporaryDir;
}
public void setTemporaryDir(String temporaryDir) {
this.temporaryDir = temporaryDir;
}
/**
*
*/
private File outputDir = new File("/var/tmp/wayback/warcs");
public String getOutputDir() {
return this.outputDir.getName();
}
public void setOutputDir(String outputDir) {
this.outputDir = new File(outputDir);
}
/**
*
*/
private String archivePrefix = "http://localhost:8080/wayback/wayback";
public String getArchivePrefix() {
return archivePrefix;
}
public void setArchivePrefix(String archivePrefix) {
this.archivePrefix = archivePrefix;
}
private boolean deleteContext = true;
public boolean getDeleteContext() {
return deleteContext;
}
public void setDeleteContext(boolean deleteContext) {
this.deleteContext = deleteContext;
}
private Date currentDate;
@Override
public Date getResultDate() {
return currentDate;
}
@Override
public String getResult() {
sdf.setTimeZone(TimeZone.getTimeZone("GMT"));
StringBuilder strb = new StringBuilder();
strb.append(archivePrefix);
strb.append(sdf.format(currentDate));
strb.append("/");
strb.append(url);
return strb.toString().replaceAll("/http:/", "").replaceAll("/https:/", "");
}
private String errorMessage;
public String getErrorMessage() {
return errorMessage;
}
public void setErrorMessage(String errorMessage) {
this.errorMessage = errorMessage;
}
@Override
public void setUrl(String url) {
this.url = url;
}
@Override
public void setScope(CrawlScope scope) {
this.scope = scope;
}
@Override
public boolean run() {
setErrorMessage("");
try {
this.crawlJob = new CrawlJob(initializeCrawlContext(url));
} catch (WrongURIException ex){
errorMessage = ex.getMessage();
return false;
}
if (crawlJob.isLaunchable()) {
Logger.getLogger(WebarchiveHandlerImpl.class.getName()).info(
"crawljob is launchable");
crawlJob.checkXML();
crawlJob.launch();
if (!crawlJob.isRunning()) {
try {
Thread.sleep(ONE_SECOND);
} catch (InterruptedException ex) {
Logger.getLogger(
WebarchiveHandlerImpl.class.getName()).log(Level.SEVERE, null, ex);
errorMessage = ex.getMessage();
return false;
}
}
}
Logger.getLogger(WebarchiveHandlerImpl.class.getName()).info(
"is crawlJob running? " + crawlJob.isRunning());
while (crawlJob.isRunning()) {
try {
if (crawlJob.isUnpausable()) {
crawlJob.getCrawlController().getFrontier().run();
}
Logger.getLogger(WebarchiveHandlerImpl.class.getName()).info(
"crawljob is running");
Thread.sleep(ONE_SECOND * 2);
} catch (InterruptedException e) {
errorMessage = e.getMessage();
return false;
}
}
crawlJob.terminate();
Logger.getLogger(WebarchiveHandlerImpl.class.getName()).info(
"crawljob terminated");
if (crawlJob.teardown()) {
Logger.getLogger(WebarchiveHandlerImpl.class.getName()).info(
"crawljob teardown");
if (!copyWarcFileIntoRepository()){
errorMessage = UNREACHABLE_URI_MSG + " : " + url;
Logger.getLogger(WebarchiveHandlerImpl.class.getName()).warning(errorMessage);
return false;
}
removeConfigFile(currentJobOutputDir);
}
return true;
}
/**
* This method initializes the heritrix context before starting the crawl
* @return
*/
private File initializeCrawlContext(String url) throws WrongURIException{
// Create one directory
currentDate = new Date();
currentJobOutputDir = new File(temporaryDir + "/" + "crawl" + "-" + currentDate.getTime());
if (!currentJobOutputDir.exists()) {
boolean success = currentJobOutputDir.mkdir();
if (success) {
Logger.getLogger(WebarchiveHandlerImpl.class.getName()).info(
"Directory: " + currentJobOutputDir + " created");
}
}
try {
BufferedReader in = new BufferedReader(
new FileReader(crawlConfigFilePath + "/" + heritrixFileName));
String c;
StringBuffer newContextFile = new StringBuffer();
while ((c = in.readLine()) != null) {
if (c.equalsIgnoreCase(urlStrToReplace)) {
try {
newContextFile.append(UURIFactory.getInstance(StringEscapeUtils.escapeXml(url)).toString());
} catch (IOException ex) {
Logger.getLogger(WebarchiveHandlerImpl.class.getName()).severe(ex.getMessage());
throw new WrongURIException(ex.getMessage());
}
} else {
newContextFile.append(c);
}
newContextFile.append("\r");
}
FileWriter fw = new FileWriter(currentJobOutputDir.getPath() + "/" + heritrixFileName);
fw.write(newContextFile.toString());
fw.close();
in.close();
} catch (IOException ex) {
Logger.getLogger(WebarchiveHandlerImpl.class.getName()).severe(ex.getMessage());
}
return new File(currentJobOutputDir.getPath() + "/" + heritrixFileName);
}
/**
* Clean-up configuration files before leaving
* @param file
* @return
*/
private boolean removeConfigFile(File file) {
if (deleteContext) {
if (file.exists()) {
File[] files = file.listFiles();
for (int i = 0; i < files.length; i++) {
if (files[i].isDirectory()) {
removeConfigFile(files[i]);
} else {
files[i].delete();
}
}
}
return (file.delete());
}
return true;
}
/**
* Copy the result of the crawl (warc file) into wayback directory
* @param file
* @return
*/
private boolean copyWarcFileIntoRepository() {
logger.info("copyWarcFileIntoRepository");
boolean result = false;
logger.info(currentJobOutputDir.getAbsolutePath() + "/" + warcDir);
File warcFileDir = new File(currentJobOutputDir + "/" + warcDir);
File sourceWarcFile = null;
if (warcFileDir.exists()) {
File[] files = warcFileDir.listFiles();
if (files.length == 1) {
String fileName = files[0].getName();
if (fileName.substring(fileName.lastIndexOf("."), fileName.length()).equalsIgnoreCase(warcExtension)) {
sourceWarcFile = new File(warcFileDir.getPath() + "/" + fileName);
logger.info("sourceWarcFile " + sourceWarcFile);
}
}
} else {
logger.info("warcDirFIle does not exist");
}
FileInputStream sourceFile = null;
FileOutputStream destinationFile = null;
if (sourceWarcFile != null) {
logger.info("sourceWarcFile != null");
try {
sourceFile = new FileInputStream(sourceWarcFile);
destinationFile = new FileOutputStream(outputDir + "/" + sourceWarcFile.getName());
logger.info(outputDir + "/" + sourceWarcFile.getName());
byte buffer[] = new byte[512 * 1024];
int nbLecture;
while ((nbLecture = sourceFile.read(buffer)) != -1) {
destinationFile.write(buffer, 0, nbLecture);
}
result = true;
} catch (FileNotFoundException f) {
result = false;
Logger.getLogger(WebarchiveHandlerImpl.class.getName()).severe(f.getMessage());
} catch (IOException e) {
result = false;
Logger.getLogger(WebarchiveHandlerImpl.class.getName()).severe(e.getMessage());
} finally {
try {
sourceFile.close();
} catch (Exception e) {
Logger.getLogger(WebarchiveHandlerImpl.class.getName()).severe(e.getMessage());
}
try {
destinationFile.close();
} catch (Exception e) {
Logger.getLogger(WebarchiveHandlerImpl.class.getName()).severe(e.getMessage());
}
}
saveArchiveDate(sourceWarcFile.getName());
logger.info("saveArchiveDate " + sourceWarcFile.getAbsolutePath());
} else {
result = false;
}
if (result) {
saveArchiveDate(sourceWarcFile.getName());
logger.info("saveArchiveDate " + sourceWarcFile.getAbsolutePath());
}
logger.info("End of CopyWarcFileIntoRepository");
return result;
}
/**
* Save the date of the archive from its name
* assuming that the archive name respects the following pattern
* ${prefix}-${fetch-time}-${index}.warc
* The fetch time is located between the two '-' characters.
*/
private void saveArchiveDate(String archiveName) {
sdf.setTimeZone(TimeZone.getTimeZone("GMT"));
try {
currentDate = sdf.parse(archiveName.substring(archiveName.indexOf('-') + 1,
archiveName.lastIndexOf('-')));
} catch (ParseException ex) {
Logger.getLogger(CrawlerImpl.class.getName()).log(Level.SEVERE, null, ex);
}
if (currentDate != null) {
//wayback seems to add 1 second to the date comparing to heritrix date
currentDate.setTime(currentDate.getTime() + ONE_SECOND);
} else {
currentDate = new Date();
}
}
}