package eu.leads.processor.utils;
import eu.leads.crawler.*;
import eu.leads.crawler.concurrent.Queue;
import eu.leads.crawler.download.DefaultDownloader;
import eu.leads.crawler.download.DefaultDownloaderController;
import eu.leads.crawler.download.DefaultProxyController;
import eu.leads.crawler.parse.DefaultParser;
import eu.leads.crawler.parse.DefaultParserController;
import eu.leads.crawler.utils.Infinispan;
import eu.leads.processor.Module;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import javax.jms.Message;

import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.Proxy;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

import static java.lang.System.getProperties;
/**
 * Module that runs a persistent (resumable) web crawl: crawl state lives in an
 * Infinispan-backed queue, and crawl parameters (seed URL, keyword list,
 * crawler count, document age) can be overridden via an optional
 * {@code config.properties} file on the classpath.
 *
 * Originally created with IntelliJ IDEA by vagvaz, 11/7/13.
 */
public class PersistentCrawlingModule extends Module {
    // Log against this class; the original logged against PersistentCrawl
    // (copy-paste slip), misattributing every log line.
    private static final Log log = LogFactory.getLog(PersistentCrawlingModule.class.getName());

    CrawlerController crawlerController;

    public PersistentCrawlingModule(String url, String name) throws Exception {
        super(url, name);
    }

    /**
     * Configures and runs the crawl. Reads optional overrides ({@code seed},
     * {@code words}, {@code ncrawlers}, {@code ndays}) from
     * {@code config.properties} into the system properties, wires the
     * downloader/parser controllers, starts the crawler(s) against the
     * Infinispan-backed queue, blocks until the crawl finishes, then waits
     * for {@link #triggerShutdown()} to flip the running flag.
     *
     * @throws Exception propagated from module/crawler setup
     */
    @Override
    protected void run() throws Exception {
        List<Proxy> proxies = new ArrayList<Proxy>();
        String seed = "http://www.economist.com/";
        ArrayList<String> words = new ArrayList<String>();
        int ncrawlers = 1;
        int ndays = 31;

        // Overlay config.properties (if present) onto the system properties.
        // getResourceAsStream returns null when the file is missing; the
        // original code would NPE on load(null) instead of reaching the
        // "defaulting" branch, and never closed the stream.
        try (InputStream cfg = PersistentCrawlingModule.class.getClassLoader()
                .getResourceAsStream("config.properties")) {
            if (cfg != null) {
                getProperties().load(cfg);
                log.info("Found properties file.");
            } else {
                log.info("Found no config.properties file; defaulting.");
            }
        } catch (IOException e) {
            log.info("Found no config.properties file; defaulting.");
        }

        if (getProperties().containsKey("seed")) {
            seed = getProperties().getProperty("seed");
            log.info("Seed : " + seed);
        }
        if (getProperties().containsKey("words")) {
            // Comma-separated keyword list; tokens are taken verbatim (no trim).
            for (String w : getProperties().getProperty("words").split(",")) {
                log.info("Adding word :" + w);
                words.add(w);
            }
        } else {
            words.add("Obama");
        }

        Infinispan.start();

        if (getProperties().containsKey("ncrawlers")) {
            ncrawlers = Integer.parseInt(getProperties().getProperty("ncrawlers"));
            log.info("Using " + ncrawlers + " crawler(s)");
        }
        if (getProperties().containsKey("ndays")) {
            ndays = Integer.parseInt(getProperties().getProperty("ndays"));
            log.info("Documents earlier than " + ndays + " day(s)");
        }

        // Direct connection only; a real proxy list could be added here.
        proxies.add(Proxy.NO_PROXY);
        DefaultProxyController proxyController = new DefaultProxyController(proxies);

        DefaultDownloader downloader = new DefaultDownloader();
        downloader.setAllowedContentTypes(new String[]{"text/html", "text/plain"});
        downloader.setMaxContentLength(100000);
        downloader.setTriesCount(3);
        downloader.setProxyController(proxyController);

        DefaultDownloaderController downloaderController = new DefaultDownloaderController();
        downloaderController.setGenericDownloader(downloader);

        DefaultParserController defaultParserController = new DefaultParserController();
        defaultParserController.setGenericParser(DefaultParser.class);

        CrawlerConfiguration configuration = new CrawlerConfiguration();
        configuration.setMaxHttpErrors(HttpURLConnection.HTTP_BAD_GATEWAY, 10);
        configuration.setMaxLevel(3);
        configuration.setMaxParallelRequests(5);
        configuration.setPolitenessPeriod(500);

        try {
            // PersistentListener listener = new PersistentListener(words,ndays);
            for (int i = 0; i < ncrawlers; i++) {
                PersistentCrawler crawler = new PersistentCrawler();
                crawler.setDownloaderController(downloaderController);
                crawler.setParserController(defaultParserController);
                configuration.addCrawler(crawler);
            }
            crawlerController = new CrawlerController(configuration);

            // Resume from the persisted queue; only inject the seed on a
            // fresh (empty-queue) run so restarts do not re-crawl from scratch.
            Queue q = Infinispan.getOrCreateQueue("queue");
            log.info(q.size());
            crawlerController.setQueue(q);
            if (!seed.isEmpty() && q.size() == 0) {
                crawlerController.addSeed(new URL(seed));
            }
            crawlerController.start();
            crawlerController.join();
        } catch (Exception e) {
            log.error("Crawl failed", e);
        }

        Infinispan.stop();
        System.out.println("Terminated.");

        // Block until triggerShutdown() clears the running flag. Sleep-poll
        // instead of the original hot spin, which burned a full core.
        while (isRunning()) {
            try {
                Thread.sleep(100L);
            } catch (InterruptedException ie) {
                Thread.currentThread().interrupt();
                break;
            }
        }
    }

    /** No-op: this module does not consume JMS messages. */
    @Override
    public void onMessage(Message message) {
    }

    /**
     * Stops the module: disables communication and halts the crawler
     * controller if it was ever started.
     */
    @Override
    public void triggerShutdown() {
        super.triggerShutdown();
        try {
            // NOTE(review): `com` is presumably a communication handle
            // inherited from Module — confirm against the base class.
            com.disable();
            // run() may not have constructed the controller yet; the
            // original would NPE here on an early shutdown.
            if (crawlerController != null) {
                crawlerController.stop();
            }
        } catch (Exception e) {
            log.error("Error during crawler shutdown", e);
        }
    }
}