package com.cse10.crawler.paperCrawler;
import com.cse10.article.Article;
import com.cse10.crawler.contentHandler.BasicContentHandler;
import com.cse10.crawler.contentHandler.NewYorkTimesContentHandler;
import com.cse10.crawler.crawlControler.NewYorkTimesCrawlController;
import com.cse10.database.DatabaseHandler;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.url.WebURL;
import java.util.List;
/**
* Created by TharinduWijewardane on 2015-02-07.
*/
/**
 * Crawler for New York Times article pages.
 *
 * <p>Restricts crawling to URLs under {@code http://www.nytimes.com/<currentYearMonth>}
 * (the month is taken from {@link NewYorkTimesCrawlController#currentYearMonth}) and
 * rewrites accepted URLs to go through a local cookie-handling proxy script before
 * they are fetched. Fetched pages are parsed by {@link NewYorkTimesContentHandler}
 * and non-empty articles are persisted via {@link DatabaseHandler}.
 */
public class NewYorkTimesCrawler extends BasicCrawler {

    /**
     * Decides whether the given URL should be crawled.
     *
     * <p>A URL is accepted when the superclass filter passes and the URL lies under
     * the NYT path for the month currently being crawled. Accepted URLs are rewritten
     * in place to route through the local cookie-handler proxy.
     *
     * @param url the candidate URL discovered by the crawler
     * @return {@code true} if the URL should be fetched, {@code false} otherwise
     */
    @Override
    public boolean shouldVisit(WebURL url) {
        String href = url.getURL().toLowerCase();
        boolean shouldVisit = super.shouldVisit(url)
                && href.startsWith("http://www.nytimes.com/" + NewYorkTimesCrawlController.currentYearMonth);
        if (shouldVisit) {
            // Literal replacement: replaceAll() would treat the dots as regex
            // wildcards and could rewrite unintended hosts (e.g. "wwwXnytimes.com").
            url.setURL(url.getURL().replace("www.nytimes.com/", "localhost/cookieHandler/newyorkTimes.php?path="));
            return true;
        }
        return false;
    }

    /**
     * Processes a fetched page: extracts NYT articles from it and stores every
     * article with non-empty content in the database.
     *
     * @param page the fetched page, ready for content extraction
     */
    @Override
    public void visit(Page page) {
        super.visit(page);
        logger.info("=============");
        logger.info("********* inside if NewYork Times ***********");
        // Handler is stateless per page; a local is sufficient (was a field
        // that got reassigned on every visit).
        BasicContentHandler contentHandler = new NewYorkTimesContentHandler();
        List<Article> articles = contentHandler.extractArticles(page);
        for (Article article : articles) {
            logger.info("***********************************start");
            logger.info(article.getContent());
            // Guard against null as well as empty content before persisting.
            String content = article.getContent();
            if (content != null && !content.isEmpty()) {
                DatabaseHandler.insertArticle(article);
            }
            logger.info("***********************************end");
        }
    }
}