/*
* Copyright 2011 Marek Pilecky
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.github.mefi.jkuuza.crawler;
import com.github.mefi.jkuuza.app.db.DbConnector;
import com.github.mefi.jkuuza.crawler.gui.CrawlerConsole;
import java.io.IOException;
import java.util.List;
import org.niocchi.core.Crawler;
import org.niocchi.core.ResourceException;
import org.niocchi.core.URLPoolException;
import org.niocchi.core.Worker;
/**
 * Small facade that wires a Niocchi {@link Crawler}, a {@link TimeoutURLPool} and a
 * {@link DbSaveWorker} together, runs the crawl and prints the crawler's counters
 * to the {@link CrawlerConsole}.
 *
 * @author Marek Pilecky
 */
public class SimpleCrawler {

    // number of URLs that can be crawled simultaneously
    int resourcesCount = 30;

    // User-Agent string handed to the crawler in init()
    String userAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.696.68 Safari/534.24";

    // connector passed on to the DbSaveWorker
    DbConnector connector;

    // the following three are (re)created on every call to init()
    TimeoutURLPool urlPool;
    Crawler crawler;
    Worker worker;

    /**
     * Creates a crawler facade bound to the given connector.
     *
     * @param connector connector that is later passed to the {@link DbSaveWorker}
     */
    public SimpleCrawler(DbConnector connector) {
        this.connector = connector;
    }

    /**
     * Creates the crawler, the URL pool and the worker and stores them in the
     * corresponding fields.
     *
     * @param list list of domains to crawl; it is handed to the {@link ExpandableURLPool}
     * @throws IOException propagated from the setup calls made in this method
     */
    private void init(List list) throws IOException {
        // create the crawler
        crawler = new Crawler(new HTMLResourceFactory(), resourcesCount);
        crawler.setUserAgent(userAgent);

        // create the url pool
        urlPool = new TimeoutURLPool(new ExpandableURLPool(list));

        // create the worker
        worker = new DbSaveWorker(crawler, urlPool, connector);

        // print info
        CrawlerConsole.print("Crawler initialized.", true);
    }

    /**
     * Initializes the crawler for the given list, starts the worker, runs the crawler
     * on the URL pool, waits for the worker to finish and finally prints the crawler's
     * counters to the console.
     *
     * @param list list of domains to crawl
     * @throws IOException propagated from initialization or from the crawl itself
     * @throws InterruptedException propagated from the crawl or from waiting for the worker
     * @throws ResourceException propagated from the crawl
     * @throws URLPoolException propagated from the crawl
     */
    public void crawl(List list) throws IOException, InterruptedException, ResourceException, URLPoolException {
        this.init(list);

        // start workers
        worker.start();
        CrawlerConsole.print("Crawler started.", true);
        CrawlerConsole.printNewLine();

        // start crawler
        crawler.run(urlPool);

        // wait for workers to finish
        worker.join();

        CrawlerConsole.printNewLine();
        CrawlerConsole.print("Crawler finished.", true);
        printStatistics();
    }

    /**
     * Prints the counters exposed by the crawler, one console line per counter.
     */
    private void printStatistics() {
        // The Czech message reads "Crawling duration: <n> seconds".
        // NOTE(review): the unit of select_total_time is not visible here; dividing by 60
        // while labelling the result as seconds looks suspicious - confirm against Niocchi.
        CrawlerConsole.print("Doba crawlování: " + crawler.select_total_time / 60 + " vteřin");
        CrawlerConsole.print(crawler.processed_count + " URL processed");
        CrawlerConsole.print(crawler.status_200 + " with status 200");
        CrawlerConsole.print(crawler.redirected_count + " redirections");
        CrawlerConsole.print(crawler.status_other + " other status");
        CrawlerConsole.print(crawler.incomplete_count + " incomplete");
    }

    /**
     * Convenience entry point that simply delegates to {@link #crawl(List)}.
     *
     * @param list list of domains to crawl
     * @throws Exception anything thrown by {@link #crawl(List)}
     */
    public void execute(List list) throws Exception {
        this.crawl(list);
    }
}