package com.cse10.crawler.crawlControler;

import com.cse10.util.GlobalConstants;
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import org.apache.log4j.Logger;

import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Observable;

/**
 * Base class for crawl controllers: holds the shared crawler4j configuration
 * and the crawl date range used by subclasses.
 * <p>
 * Created by Sampath Liyanage on 13.07.2014.
 */
public abstract class BasicCrawlController extends Observable {

    protected Logger logger = Logger.getLogger(this.getClass());

    private CrawlConfig config;

    // global configuration
    final String CRAWL_STORAGE_DIR_ROOT = "E:/CrawlData";
    final String PROXY_ADDRESS = GlobalConstants.PROXY_ADDRESS;
    final int PROXY_PORT = GlobalConstants.PROXY_PORT;

    // start and end dates used by subclasses
    protected String startDate;
    protected String endDate;

    protected CrawlController controller; // to be used in subclasses
    protected boolean crawlingStopped = false;

    /**
     * @param startDate format: yyyy-MM-dd
     */
    public void setStartDate(String startDate) {
        this.startDate = startDate;
    }

    public void setStartDate(Date startDate) {
        this.startDate = new SimpleDateFormat("yyyy-MM-dd").format(startDate);
    }

    /**
     * @param endDate format: yyyy-MM-dd
     */
    public void setEndDate(String endDate) {
        this.endDate = endDate;
    }

    public void setEndDate(Date endDate) {
        this.endDate = new SimpleDateFormat("yyyy-MM-dd").format(endDate);
    }

    public BasicCrawlController() {
        config = new CrawlConfig();

        // ---------- default configuration ----------

        /*
         * crawlStorageFolder is a folder where intermediate crawl data is
         * stored; each controller subclass gets its own subfolder.
         */
        String crawlStorageFolder = CRAWL_STORAGE_DIR_ROOT + "/" + this.getClass().getSimpleName();
        config.setCrawlStorageFolder(crawlStorageFolder);

        /*
         * numberOfCrawlers is the number of concurrent threads that should
         * be initiated for crawling. (Note: currently unused in this
         * constructor.)
         */
        int numberOfCrawlers = 4;

        /*
         * Be polite: don't send more than one request per second
         * (1000 milliseconds between requests).
         */
        config.setPolitenessDelay(1000);

        /*
         * Maximum crawl depth. The default value is -1 for unlimited depth.
         */
        config.setMaxDepthOfCrawling(1);

        /*
         * Maximum number of pages to crawl. The default value is -1 for an
         * unlimited number of pages.
         */
        config.setMaxPagesToFetch(-1);

        /*
         * Route requests through a proxy if one is configured.
         */
        if (!PROXY_ADDRESS.isEmpty() && PROXY_PORT != 0) {
            config.setProxyHost(PROXY_ADDRESS);
            config.setProxyPort(PROXY_PORT);
        }

        /*
         * If your proxy also needs authentication:
         * config.setProxyUsername(username);
         * config.setProxyPassword(password);
         */

        /*
         * This config parameter can be used to make the crawl resumable
         * (meaning that you can resume the crawl from a previously
         * interrupted/crashed crawl). Note: if you enable the resuming
         * feature and want to start a fresh crawl, you need to delete the
         * contents of the root folder manually.
         */
        config.setResumableCrawling(false);

        // ---------- customized configuration ----------
        configure(config);

        logger.info(config.toString()); // log the effective configuration
    }
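    /*
     * Hypothetical sketch (the depth value is illustrative): a subclass that
     * needs a deeper crawl could override configure() to adjust the defaults
     * set above, for example:
     *
     *   @Override
     *   protected CrawlConfig configure(CrawlConfig crawlConfig) {
     *       crawlConfig.setMaxDepthOfCrawling(3); // follow links three levels deep
     *       return crawlConfig;
     *   }
     */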
    public CrawlConfig getConfig() {
        return config;
    }

    /**
     * Override if the default configuration should be customized.
     */
    protected CrawlConfig configure(CrawlConfig crawlConfig) {
        return crawlConfig;
    }

    public abstract <T extends WebCrawler> void crawl(final Class<T> _c) throws Exception;

    public void stopCrawl() {
        crawlingStopped = true;
        if (controller != null) {
            controller.shutdown();
        }
    }
}
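/*
 * Usage sketch (hypothetical: the subclass name, seed URL, and thread count
 * are illustrative, not part of this project). A concrete subclass would
 * typically implement crawl() by wiring up the standard crawler4j page
 * fetcher and robots.txt handler, then starting the controller:
 *
 *   import edu.uci.ics.crawler4j.fetcher.PageFetcher;
 *   import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
 *   import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
 *
 *   public class ExampleCrawlController extends BasicCrawlController {
 *       @Override
 *       public <T extends WebCrawler> void crawl(final Class<T> _c) throws Exception {
 *           PageFetcher pageFetcher = new PageFetcher(getConfig());
 *           RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
 *           RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
 *           controller = new CrawlController(getConfig(), pageFetcher, robotstxtServer);
 *           controller.addSeed("http://www.example.com/");
 *           controller.start(_c, 4); // blocks until the crawl finishes or shutdown() is called
 *       }
 *   }
 */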