package com.cse10.crawler.contentHandler;
import com.cse10.article.Article;
import com.cse10.article.DailyMirrorArticle;
import com.cse10.crawler.crawlControler.DailyMirrorCrawlController;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.List;
/**
* Created by TharinduWijewardane on 10.07.2014.
*/
public class DailyMirrorContentHandler extends BasicContentHandler {
@Override
public List extractArticles(Page page) {
if (page.getParseData() instanceof HtmlParseData) {
HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
String html = htmlParseData.getHtml();
Document doc = Jsoup.parseBodyFragment(html);
Elements articleElements = doc.getElementsByClass("article-content");
for (Element articleElement : articleElements) {
String content = articleElement.text();
if (!filterArticles(content)) {
continue; // ignore the article if filter does not approve
}
Article article = new DailyMirrorArticle();
article.setContent(content);
String title = page.getWebURL().getPath().replaceAll("/.*/", "");
title = title.replaceAll(".html", "");
title = title.replaceAll("^[^a-zA-Z]+", "");
title = title.replaceAll("-", " ");
article.setTitle(title);
DateFormat df = new SimpleDateFormat("yyyy-MM-dd");
try {
article.setCreatedDate(df.parse(DailyMirrorCrawlController.current_date));
} catch (ParseException e) {
e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
}
articles.add(article);
}
}
return articles;
}
}