package at.chille.crawler;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.util.Map;
import org.apache.http.conn.scheme.SchemeSocketFactory;
import org.apache.http.conn.ssl.SSLSocketFactory;
import org.apache.http.conn.ssl.TrustStrategy;
import org.apache.http.conn.ssl.X509HostnameVerifier;
import org.apache.log4j.Logger;
import at.chille.crawler.database.model.HostInfo;
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
/**
 * CrawlController for the HTTP analysis crawler: configures crawler4j,
 * sets up the database session and the HTTPS socket factory, and runs the
 * crawl with a small interactive console for status queries and aborting.
 *
 * @author chille
 */
public class HttpAnalysisCrawlController extends CrawlController {
public HttpAnalysisCrawlController(CrawlConfig config,
PageFetcher pageFetcher, RobotstxtServer robotstxtServer)
throws Exception {
super(config, pageFetcher, robotstxtServer);
}
    /** Whether this crawl should be resumable / resume previous intermediate data; chosen interactively in main(). */
    protected static boolean resumable = true;
    /** Number of concurrent crawler threads, taken from the command line. */
    protected static int threads = 1;
    protected static Logger logger = Logger.getLogger(HttpAnalysisCrawlController.class);
@Override
protected void cronJob() {
super.cronJob();
        // Deprecated: saving the whole session here took about 35 seconds for
        // 5000 hosts, and locking the database for that long is far too slow.
        // Instead, changes are stored directly whenever a HostInfo is modified.
        /*
         * try {
         *     if (new Date().getTime() > lastStoreDatabase + storeInterval) {
         *         logger.info("Saving to database...");
         *         DatabaseManager.getInstance().saveSession();
         *         logger.info("Saved to database.");
         *         lastStoreDatabase = new Date().getTime();
         *     }
         * } catch (Exception ex) {
         *     ex.printStackTrace();
         * }
         */
}
public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            System.out.println("Required parameters:");
            System.out.println("\t rootFolder (will contain intermediate crawl data)");
            System.out.println("\t numberOfCrawlers (number of concurrent threads)");
            return;
        }
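        // Console reader for the interactive prompts below (resumable question and runtime commands).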
BufferedReader console = new BufferedReader(new InputStreamReader(
System.in));
        while (true) {
            System.out.println("Should crawling be resumable (and resume a previous crawl if one exists)? (y/yes/n/no)");
            String command = console.readLine();
            if (command == null) {
                // End of input: keep the default value of resumable.
                break;
            }
            if (command.equalsIgnoreCase("y") || command.equalsIgnoreCase("yes")) {
                resumable = true;
                break;
            }
            if (command.equalsIgnoreCase("n") || command.equalsIgnoreCase("no")) {
                resumable = false;
                break;
            }
        }
System.out.println("Initializing Crawler Config...");
String crawlStorageFolder = args[0];
int numberOfCrawlers = Integer.parseInt(args[1]);
CrawlConfig config = new CrawlConfig();
threads = numberOfCrawlers;
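        // Basic crawler4j settings: timeouts, connection limits, redirects, HTTPS pages, no depth or page limit.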
config.setCrawlStorageFolder(crawlStorageFolder);
        config.setPolitenessDelay(10); // keep this small; the per-host wait time
                                       // (niceWaitTime) is handled in HttpAnalysisCrawler
config.setIncludeHttpsPages(true);
config.setFollowRedirects(true);
config.setConnectionTimeout(4000);
config.setSocketTimeout(10000);
config.setMaxConnectionsPerHost(10);
config.setMaxTotalConnections(1000);
config.setUserAgentString("Crawler for Research Purposes; still under development; based on crawler4j (http://code.google.com/p/crawler4j/)");
        config.setMaxDepthOfCrawling(-1); // -1 = unlimited depth
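        // With resumable crawling enabled, crawler4j keeps its intermediate data in
        // crawlStorageFolder so that an interrupted crawl can be continued later.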
config.setResumableCrawling(resumable);
        config.setMaxPagesToFetch(-1); // -1 = unlimited; use 1 only when testing a single page
// Try to initialize Database
logger.info("Initialize Database...");
try {
DatabaseManager.getInstance();
if (resumable) {
logger.info("Loading last Crawling Session...");
DatabaseManager.getInstance().loadLastCrawlingSession();
}
if (DatabaseManager.getInstance().getCurrentCrawlingSession() == null) {
logger.info("Generate New Crawling Session...");
DatabaseManager.getInstance().setNewCrawlingSession(
"Crawling Testing");
}
DatabaseManager.getInstance().saveSession();
// DatabaseManager.getInstance().tryAddingSomething();
} catch (Exception ex) {
ex.printStackTrace();
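            // Re-throw: without a working database the results could not be stored,
            // so there is no point in starting the crawl.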
throw ex;
}
// Hint: Exit here to test database schema only
// System.exit(0);
logger.info("Setting up TrustStrategy and HostnameVerifier to catch the HTTPS Details...");
        try {
            TrustStrategy ts = new AllowAllTrustStrategy();
            X509HostnameVerifier hv = new AllAllowHostNameVerifier();
            SchemeSocketFactory httpsSocketFactory = new SSLSocketFactory(ts, hv);
            config.setHttpsSocketFactory(httpsSocketFactory);
        } catch (Exception ex) {
            // If this fails, the crawl still runs, but HTTPS details cannot be captured.
            System.err.println(ex.toString());
        }
System.out.println("Crawling configuration:");
System.out.println(config);
// Instantiate the controller for this crawl.
logger.info("Init crawler...");
PageFetcher pageFetcher = new PageFetcher(config);
RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig,
pageFetcher);
CrawlController controller = new HttpAnalysisCrawlController(config,
pageFetcher, robotstxtServer);
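        // Seed URLs are read from seeds.txt in the working directory (one URL per line expected).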
logger.info("Adding Seeds...");
        for (String seed : StringFileReader.readLines("seeds.txt")) {
            controller.addSeed(seed);
            logger.info("Adding seed: " + seed);
        }
        logger.info("Starting Crawler...");
        // Blocking alternative: controller.start(HttpAnalysisCrawler.class, numberOfCrawlers);
        controller.startNonBlocking(HttpAnalysisCrawler.class, numberOfCrawlers);
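        // The crawl now runs in background threads; this loop is a minimal console
        // interface for querying progress or stopping the crawl.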
        while (true) {
            System.err.println("Enter 'abort' to stop the crawl or 'status' to show progress: ");
            String command = console.readLine();
            if (command == null || command.equalsIgnoreCase("abort")) {
                break;
            }
            if (command.equalsIgnoreCase("status")) {
                try {
                    System.err.println("Queue Length: "
                            + controller.getFrontier().getQueueLength());
                    System.err.println("Processed Pages: "
                            + controller.getFrontier().getNumberOfProcessedPages());
                    // System.err.println("Assigned Pages: "
                    //         + controller.getFrontier().getNumberOfAssignedPages());
                } catch (Exception ex) {
                    ex.printStackTrace();
                }
            }
        }
controller.shutdown();
controller.waitUntilFinish();
// end of nonblocking version of crawler.
System.out.println("\n\n-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-");
System.out.println("Finally storing the results to the database...");
DatabaseManager.getInstance().saveSession();
System.out.println("Done: You can abort this process.");
// Final output of crawler *IF* it finished:
System.out.println("\n\n");
Map<String, HostInfo> visitedHosts = DatabaseManager.getInstance()
.getCurrentCrawlingSession().getHosts();
System.out.println("Size of visited hosts: " + visitedHosts.size());
        // Optional: list every visited host with its number of pages
        // for (Map.Entry<String, HostInfo> host : visitedHosts.entrySet()) {
        //     System.out.println("  " + host.getKey() + " ("
        //             + host.getValue().getPages().size() + ")");
        // }
// System.out.println("\n\n");
// System.out.println(CertificateLogger.getInstance());
}
}