package com.cse10.crawler.contentHandler;
import com.cse10.article.Article;
import com.cse10.article.NewsFirstArticle;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.Locale;
/**
 * Content handler that builds a {@link NewsFirstArticle} from a crawled HTML page.
 * The article is looked up in the page by the element id
 * {@code "post-" + <last segment of the page URL path>}.
 *
 * Created by TharinduWijewardane on 17.07.2014.
 */
public class NewsFirstContentHandler extends BasicContentHandler {

    /** Pattern of the text found in the element with class "date", e.g. "July 17, 2014". */
    private static final String DATE_PATTERN = "MMMM d, yyyy";

    /**
     * Extracts the article contained in the given page, if any, and appends it to
     * {@code articles} (a member not declared in this class; presumably inherited from
     * {@link BasicContentHandler}).
     *
     * Nothing is added when the page is not HTML, when no element with the expected
     * post id exists, or when {@code filterArticles} rejects the extracted content.
     * A missing title or author is stored as an empty string; a missing or unparseable
     * date is stored as {@code null}.
     *
     * @param page the crawled page
     * @return the {@code articles} list, with the extracted article appended when one was found
     */
    @Override
    public List extractArticles(Page page) {
        if (!(page.getParseData() instanceof HtmlParseData)) {
            return articles;
        }

        // The element id is derived from the last path segment of the page URL.
        String postId = "post-" + page.getWebURL().getPath().replaceAll(".*/", "");
        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
        Document doc = Jsoup.parseBodyFragment(htmlParseData.getHtml());

        Element articleElement = doc.getElementById(postId);
        if (articleElement == null) { // if no article can be found
            return articles;
        }

        String content = extractContent(articleElement);
        if (!filterArticles(content)) {
            return articles; // ignore the article if filter does not approve
        }

        String title = firstOwnText(articleElement.getElementsByClass("post-title"));
        String dateString = firstOwnText(articleElement.getElementsByClass("date"));
        String author = firstOwnText(articleElement.getElementsByAttributeValue("rel", "author"));

        Article article = new NewsFirstArticle();
        article.setTitle(title == null ? "" : title);
        article.setCreatedDate(parseDate(dateString));
        article.setAuthor(author == null ? "" : author);
        article.setContent(content);
        articles.add(article);
        return articles;
    }

    /**
     * Returns the own text of the first matched element, or {@code null} when nothing matched.
     * Guards against {@code Elements.first()} returning {@code null} for an empty selection.
     */
    private static String firstOwnText(Elements elements) {
        Element first = elements.first();
        return first == null ? null : first.ownText();
    }

    /**
     * Parses a date written as {@link #DATE_PATTERN} with English month names.
     *
     * @param dateString the raw date text, may be {@code null}
     * @return the parsed date, or {@code null} when the text is absent or cannot be parsed
     */
    private static Date parseDate(String dateString) {
        if (dateString == null) {
            return null;
        }
        try {
            // Locale is pinned so month names parse regardless of the JVM default locale.
            return new SimpleDateFormat(DATE_PATTERN, Locale.ENGLISH).parse(dateString);
        } catch (ParseException e) {
            // Best effort: the article is still kept, just without a date.
            e.printStackTrace();
            return null;
        }
    }

    /** Concatenates the own text of every {@code <p>} that does not carry the "post-meta" class. */
    private static String extractContent(Element articleElement) {
        StringBuilder content = new StringBuilder();
        for (Element paragraph : articleElement.select("p:not(.post-meta)")) {
            content.append(paragraph.ownText());
        }
        return content.toString();
    }
}