package com.cse10.crawler.contentHandler;
import com.cse10.article.Article;
import com.cse10.article.CeylonTodayArticle;
import com.cse10.crawler.crawlControler.CeylonTodayCrawlController;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.List;
/**
 * Content handler for Ceylon Today pages. Every {@code <p>} element of a crawled HTML page
 * that passes {@code filterArticles} becomes one {@link CeylonTodayArticle}.
 *
 * Created by Sampath Liyanage on 17.07.2014.
 */
public class CeylonTodayContentHandler extends BasicContentHandler {

    /** Text that introduces a byline at the very start of a paragraph. */
    private static final String BYLINE_PREFIX = "By";

    /** Non-breaking space (U+00A0); removed from the author and from the stored content. */
    private static final String NBSP = "\u00a0";

    /** Pattern used to parse {@code CeylonTodayCrawlController.current_date}. */
    private static final String CRAWL_DATE_PATTERN = "yyyy-MM-dd";

    /**
     * Extracts articles from the given page and appends them to the inherited
     * {@code articles} list.
     *
     * <p>Elements with class {@code breakings} are removed before the paragraphs are selected.
     * For each paragraph, the text of its {@code newsdetailssubtitle} elements is used as the
     * title and the remaining text as the content. When the content opens with a byline
     * ("By", then a space or non-breaking space, then a name), the first word of that name is
     * stored as the author and the text up to and including that word is cut from the content.
     *
     * @param page the crawled page; ignored unless its parse data is {@link HtmlParseData}
     * @return the inherited {@code articles} list, including anything added by this call
     */
    @Override
    public List extractArticles(Page page) {
        if (!(page.getParseData() instanceof HtmlParseData)) {
            return articles;
        }
        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
        Document doc = Jsoup.parseBodyFragment(htmlParseData.getHtml());
        doc.getElementsByClass("breakings").remove();

        // SimpleDateFormat is not thread-safe, so it stays local; one instance serves the page.
        DateFormat crawlDateFormat = new SimpleDateFormat(CRAWL_DATE_PATTERN);

        for (Element articleElement : doc.select("p")) {
            // Detach the subtitle before reading the text so it is not repeated in the content.
            String title = articleElement.getElementsByClass("newsdetailssubtitle").remove().text();
            String content = articleElement.text();
            // Drops a leading "By..." run that ends in two consecutive whitespace characters.
            // The match is greedy: it extends to the last such pair the pattern can reach.
            content = content.replaceFirst("^By.*\\s\\s", "");
            if (!filterArticles(content)) {
                continue; // ignore the article if filter does not approve
            }

            Article article = new CeylonTodayArticle();
            article.setTitle(title);

            String author = extractAuthor(content);
            if (author != null) {
                article.setAuthor(author);
                content = stripByline(content, author);
            }
            article.setContent(content.replace(NBSP, "").trim());

            try {
                article.setCreatedDate(crawlDateFormat.parse(CeylonTodayCrawlController.current_date));
            } catch (ParseException e) {
                // Best effort: the article is still added, just without a created date.
                e.printStackTrace();
            }
            articles.add(article);
        }
        return articles;
    }

    /**
     * Finds the author named by a byline at the start of {@code content}.
     *
     * @param content paragraph text, non-breaking spaces still present
     * @return the first space-delimited word after the "By" prefix of the first sentence, with
     *         non-breaking spaces removed, or {@code null} when the first sentence is not a
     *         byline
     */
    private static String extractAuthor(String content) {
        // split() discards trailing empty strings, so text made only of periods gives an
        // empty array; indexing it unguarded would throw.
        String[] sentences = content.split("\\.");
        if (sentences.length == 0) {
            return null;
        }
        String firstSentence = sentences[0];
        if (!firstSentence.startsWith(BYLINE_PREFIX)
                || firstSentence.length() == BYLINE_PREFIX.length()) {
            return null;
        }
        // Require a separator after the prefix so that ordinary words which merely begin
        // with these letters (for example "Bystanders") are not mistaken for a byline.
        char separator = firstSentence.charAt(BYLINE_PREFIX.length());
        if (separator != ' ' && separator != NBSP.charAt(0)) {
            return null;
        }
        // Cut only the leading prefix; a global replace would also damage names that
        // contain the same letters.
        String name = firstSentence.substring(BYLINE_PREFIX.length()).replace(NBSP, "").trim();
        if (name.isEmpty()) {
            return null;
        }
        // After trim() the name starts with a non-space character, so element 0 is the
        // first real word rather than an empty leading token.
        return name.split(" +")[0];
    }

    /**
     * Cuts the byline from the front of {@code content}.
     *
     * @param content paragraph text that starts with the byline prefix
     * @param author  author word returned by {@link #extractAuthor(String)}
     * @return the text following the first occurrence of {@code author} after the prefix, or
     *         {@code content} unchanged when {@code author} does not occur in it (possible
     *         when non-breaking spaces were removed from inside the author word)
     */
    private static String stripByline(String content, String author) {
        // Plain substring search: the author is literal text, not a regular expression, and
        // only the first occurrence marks the end of the byline.
        int authorStart = content.indexOf(author, BYLINE_PREFIX.length());
        if (authorStart < 0) {
            return content;
        }
        return content.substring(authorStart + author.length());
    }
}