package com.cse10.crawler.contentHandler; import com.cse10.article.Article; import com.cse10.article.TheIslandArticle; import com.cse10.crawler.crawlControler.TheIslandCrawlController; import edu.uci.ics.crawler4j.crawler.Page; import edu.uci.ics.crawler4j.parser.HtmlParseData; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import java.text.DateFormat; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * Created by TharinduWijewardane on 10.07.2014. */ public class TheIslandContentHandler extends BasicContentHandler { @Override public List extractArticles(Page page) { if (page.getParseData() instanceof HtmlParseData) { HtmlParseData htmlParseData = (HtmlParseData) page.getParseData(); String html = htmlParseData.getHtml(); Document doc = Jsoup.parseBodyFragment(html); Element articleElement; articleElement = doc.getElementById("left_video_area"); articleElement.getElementsByTag("div").remove(); Elements els = articleElement.getElementsByTag("h1"); Element el = els.first(); String title = el.text(); articleElement.getElementsByTag("h1").remove(); articleElement.getElementsByClass("article_date").remove(); String content = articleElement.text(); if (content.length() < 150 || title.equals("News")){ return null; } if (filterArticles(content)) { // Article article = new TheIslandArticle(); article.setTitle(title); String author = null; els = articleElement.getElementsByTag("p"); article.setContent(articleElement.text()); for (Element ele : els) { if (ele.text().toLowerCase().startsWith("by") && ele.text().length() < 100){ author = ele.text(); ele.remove(); author = author.replaceFirst("(By|by|BY)\\s",""); article.setContent(articleElement.text()); } } if (author == null && !articleElement.ownText().trim().equals("")){ author = articleElement.ownText(); String cont = articleElement.text(); article.setContent(cont.replaceFirst(author, "")); author = author.replaceFirst("(By|by|BY)\\s",""); } if (author == null) { String text = articleElement.text(); Pattern pattern = Pattern.compile("^(By|by|BY)\\s([A-Z][^\\s]*\\s)+"); Matcher matcher = pattern.matcher(text); if (matcher.find()) { author = matcher.group().trim(); author = author.substring(0, author.lastIndexOf(" ")); text = text.replaceFirst(author, ""); article.setContent(text); author = author.replaceFirst("(By|by|BY)\\s", ""); article.setAuthor(author); } else { article.setContent(text); } } article.setAuthor(author); DateFormat df = new SimpleDateFormat("yyyy-MM-dd"); try { article.setCreatedDate(df.parse(TheIslandCrawlController.current_date)); } catch (ParseException e) { e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. } articles.add(article); return articles; } } return null; } }