package com.cse10.crawler.crawlControler;

import com.cse10.crawler.DateHandler;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;

/**
 * Created by Sampath on 13.07.2014
 */
public class TheIslandCrawlController extends BasicCrawlController {

    /*
     * The website of The Island newspaper uses cookies to serve news articles,
     * but crawler4j doesn't handle cookies.
     * As a workaround, a PHP proxy that handles the cookies was implemented
     * (https://github.com/sampathLiyanage/phpCurlCookies.git).
     * That PHP app acts as middleware between crawler4j and the website.
     * Put the theIsland directory inside the www folder and update the address
     * of localhostProxyUrl below accordingly.
     * Make sure the web server is running and localhostProxyUrl is reachable
     * from a browser before crawling.
     */
    final String localhostProxyUrl = "http://localhost/cookieHandler/theIsland.php";

    public static String current_date;

    public <T extends WebCrawler> void crawl(final Class<T> _c) throws Exception {

        if (startDate == null || endDate == null) {
            logger.info("Error: You should set start and end dates");
            return;
        }

        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
        Date startingDate = sdf.parse(startDate);
        startingDate = DateHandler.getFromDateToResume(startingDate, "article_the_island");

        // start date
        Calendar c = Calendar.getInstance();
        c.setTime(startingDate);

        while (c.getTime().compareTo(sdf.parse(endDate)) <= 0) {

            /*
             * Instantiate the controller for this crawl.
             */
            PageFetcher pageFetcher = new PageFetcher(getConfig());
            RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
            RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
            controller = new CrawlController(getConfig(), pageFetcher, robotstxtServer);

            /*
             * For each crawl, you need to add some seed URLs. These are the
             * first URLs that are fetched; the crawler then follows the links
             * found in those pages.
             */
            int year = c.get(Calendar.YEAR);
            int month = c.get(Calendar.MONTH) + 1; // Java defines January as 0
            int date = c.get(Calendar.DATE);
            current_date = sdf.format(c.getTime());
            String url = localhostProxyUrl + "?newsfordate=" + date + "/" + month + "/" + year;
            controller.addSeed(url);
            logger.info("crawling " + url);

            /*
             * Start the crawl. This is a blocking operation, meaning that your
             * code will reach the line after this only when crawling is finished.
             */
            controller.start(_c, 1);

            if (crawlingStopped) { // stopped from the calling class
                return;
            }

            setChanged();
            notifyObservers(sdf.format(c.getTime()));
            c.add(Calendar.DATE, 1); // move to the next day
        }
    }
}
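
/*
 * A minimal usage sketch (not part of the original source). It assumes that
 * the startDate and endDate fields read in crawl() above are accessible
 * within this package, and that getConfig() in BasicCrawlController is
 * already set up with a valid crawl storage folder; neither assumption is
 * confirmed by this file. LoggingCrawler is a hypothetical stand-in for the
 * project's real article crawler.
 */
class TheIslandCrawlExample {

    /*
     * Trivial crawler that only logs each visited page; a real crawler would
     * extract and store article content. Page is fully qualified because this
     * example cannot add imports at the top of the file.
     */
    public static class LoggingCrawler extends WebCrawler {
        @Override
        public void visit(edu.uci.ics.crawler4j.crawler.Page page) {
            System.out.println("Visited: " + page.getWebURL().getURL());
        }
    }

    public static void main(String[] args) throws Exception {
        TheIslandCrawlController ctrl = new TheIslandCrawlController();
        ctrl.startDate = "2014-07-01"; // assumed package-accessible field, yyyy-MM-dd
        ctrl.endDate = "2014-07-07";   // assumed package-accessible field, yyyy-MM-dd
        ctrl.crawl(LoggingCrawler.class); // crawls one day at a time, blocking per day
    }
}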