package com.cse10.crawler.contentHandler;

import com.cse10.article.Article;
import com.cse10.article.HiruNewsArticle;
import com.cse10.crawler.crawlControler.HiruNewsCrawlController;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
import java.util.Locale;

/**
 * Created by Tharindu on 2014-11-13.
 *
 * <p>Content handler that extracts a single news article (title, date, body text)
 * from a crawled HTML page and appends it to the {@code articles} list.
 * NOTE(review): {@code articles} and {@code filterArticles} are not declared in this
 * class; they are presumably inherited from {@link BasicContentHandler} - confirm.
 */
public class HiruNewsContentHandler extends BasicContentHandler {

    /** Titles longer than this many characters are truncated. */
    private static final int MAX_TITLE_LENGTH = 100;

    /** Pattern of the date text found in the page's "time" element. */
    private static final String DATE_PATTERN = "EEEE, d MMMM yyyy";

    /**
     * Extracts the article contained in the given page, if any.
     *
     * <p>The page is skipped (the {@code articles} list is returned unchanged) when:
     * the page is not HTML, no "left_txt" element exists, the title, date or content
     * element is missing, the date cannot be parsed, the date falls outside the
     * year/month held by {@code HiruNewsCrawlController.cal}, or
     * {@code filterArticles} rejects the content.
     *
     * @param page the crawled page
     * @return the {@code articles} list, with the extracted article appended on success
     */
    @Override
    public List extractArticles(Page page) {
        if (!(page.getParseData() instanceof HtmlParseData)) {
            return articles;
        }

        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
        Document doc = Jsoup.parseBodyFragment(htmlParseData.getHtml());

        Element articleElement = doc.getElementsByClass("left_txt").first();
        if (articleElement == null) {
            return articles; // if no article can be found
        }

        Element titleElement = firstByClass(articleElement, "lft_txtpc", "nws_tpc");
        if (titleElement == null) {
            return articles; // neither title container is present; nothing to extract
        }

        String title = titleElement.ownText();
        if (title != null && title.length() > MAX_TITLE_LENGTH) {
            title = title.substring(0, MAX_TITLE_LENGTH);
        }

        Element timeElement = titleElement.getElementsByClass("time").first();
        if (timeElement == null) {
            // Treat a missing date element the same way as an unparseable date.
            System.out.println("Error ::: Date not available");
            return articles;
        }

        Date date;
        try {
            // Day and month names are textual, so the locale must be fixed; relying on
            // the JVM default locale makes parsing fail on non-English systems.
            // NOTE(review): assumes the site renders dates in English - confirm.
            date = new SimpleDateFormat(DATE_PATTERN, Locale.ENGLISH).parse(timeElement.ownText());
        } catch (ParseException e) {
            System.out.println("Error ::: Date not available");
            e.printStackTrace();
            return articles; // if date cannot be extracted
        }

        if (!isInCrawlMonth(date)) {
            return articles; // if news does not belong to current month (ad news)
        }

        String author = null; // no author

        Element contentElement = firstByClass(articleElement, "hnimage", "lft_newscnt");
        if (contentElement == null) {
            return articles; // neither content container is present
        }
        String content = contentElement.ownText().trim();

        if (!filterArticles(content)) {
            return articles; // ignore the article if filter does not approve
        }

        Article article = new HiruNewsArticle();
        article.setTitle(title);
        article.setCreatedDate(date);
        article.setAuthor(author);
        article.setContent(content);
        articles.add(article);

        return articles;
    }

    /**
     * Returns the first element under {@code root} carrying {@code primaryClass}, or,
     * when there is none, the first element carrying {@code fallbackClass}.
     *
     * @return the matching element, or {@code null} when neither class is present
     */
    private static Element firstByClass(Element root, String primaryClass, String fallbackClass) {
        Element match = root.getElementsByClass(primaryClass).first();
        if (match != null) {
            return match;
        }
        return root.getElementsByClass(fallbackClass).first();
    }

    /**
     * Tells whether {@code date} has the same year and month as
     * {@code HiruNewsCrawlController.cal}.
     */
    private static boolean isInCrawlMonth(Date date) {
        Calendar articleCal = Calendar.getInstance();
        articleCal.setTime(date);
        return articleCal.get(Calendar.YEAR) == HiruNewsCrawlController.cal.get(Calendar.YEAR)
                && articleCal.get(Calendar.MONTH) == HiruNewsCrawlController.cal.get(Calendar.MONTH);
    }
}