package com.cse10.crawler.contentHandler;
import com.cse10.article.Article;
import com.cse10.article.NewYorkTimesArticle;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
/**
 * Content handler that extracts one article from a crawled New York Times page.
 *
 * <p>The article is read from the element with id {@code story}: the heading, the
 * dateline's {@code datetime} attribute, the first byline author and the text of
 * the body paragraphs.
 *
 * <p>Created by TharinduWijewardane on 2015-02-07.
 */
public class NewYorkTimesContentHandler extends BasicContentHandler {

    /** Pattern applied to the dateline's {@code datetime} attribute value. */
    private static final String DATE_PATTERN = "yyyy-MM-dd";

    /**
     * Extracts the article found on the given page and appends it to {@code articles}.
     *
     * <p>Nothing is appended when the page is not HTML, when it has no element with id
     * {@code story}, or when {@code filterArticles} rejects the article content.
     * A missing heading or byline yields an empty title or author, and a missing or
     * unparseable date yields a {@code null} created date, instead of failing the page.
     *
     * @param page the crawled page to inspect
     * @return the {@code articles} list (not declared in this class; presumably inherited
     *         from {@code BasicContentHandler}), including any article added by this call
     */
    @Override
    public List extractArticles(Page page) {
        if (!(page.getParseData() instanceof HtmlParseData)) {
            return articles;
        }

        System.out.println("Current URL: " + page.getWebURL());

        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
        Document doc = Jsoup.parseBodyFragment(htmlParseData.getHtml());

        Element articleElement = doc.getElementById("story");
        if (articleElement == null) { // if no article can be found
            return articles;
        }

        // Each lookup below may find nothing; the helpers tolerate a null element.
        String title = ownTextOrEmpty(articleElement.getElementById("story-heading"));

        Element dateline = articleElement.getElementsByClass("dateline").first();
        Date date = (dateline == null) ? null : parseDate(dateline.attr("datetime"));

        String author = ownTextOrEmpty(articleElement.getElementsByClass("byline-author").first());

        // Paragraph texts are joined without a separator, as before.
        StringBuilder contentBuilder = new StringBuilder();
        Elements contentElements = articleElement.select("p.story-body-text.story-content");
        for (Element contentElement : contentElements) {
            contentBuilder.append(contentElement.ownText());
        }
        String content = contentBuilder.toString();

        if (!filterArticles(content)) {
            return articles; // ignore the article if filter does not approve
        }

        Article article = new NewYorkTimesArticle();
        article.setTitle(title);
        article.setCreatedDate(date);
        article.setAuthor(author);
        article.setContent(content);
        articles.add(article);

        return articles;
    }

    /**
     * Returns the element's own text, or an empty string when the element is absent.
     *
     * @param element the element to read, possibly {@code null}
     * @return the own text of {@code element}, or {@code ""} if it is {@code null}
     */
    private static String ownTextOrEmpty(Element element) {
        return (element == null) ? "" : element.ownText();
    }

    /**
     * Parses a date string using {@link #DATE_PATTERN}.
     *
     * @param dateString the raw attribute value, possibly {@code null} or empty
     * @return the parsed date, or {@code null} when the value is blank or cannot be parsed
     */
    private static Date parseDate(String dateString) {
        if (dateString == null || dateString.trim().isEmpty()) {
            return null;
        }
        try {
            // SimpleDateFormat is not thread-safe, so a fresh instance is created per call.
            return new SimpleDateFormat(DATE_PATTERN).parse(dateString);
        } catch (ParseException e) {
            System.err.println("Could not parse article date '" + dateString + "': " + e.getMessage());
            return null;
        }
    }
}