package com.lgq.rssreader.readability;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.IdentityHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.Elements;

/**
 * Extracts the main article markup from an HTML page.
 *
 * <p>The page is parsed with jsoup, {@code <script>} elements are removed, and
 * paragraph-like nodes ({@code p}, {@code td}, {@code pre}, plus {@code div}s
 * without block-level children) award points to their parent and grandparent.
 * The inner HTML of the best-scoring container is exposed through
 * {@link #Content}.
 *
 * <p>TODO: title extraction is not implemented. Earlier revisions carried a
 * commented-out, non-Java {@code GetArticleTitle} sketch and a matching
 * {@code Title} field; port it properly if a title is needed.
 */
public class Readability {

    /** Maps a lower-case tag name to the score group used by {@link #CalculateNodeScore}. */
    static final Map<String, Integer> tagMap = new HashMap<String, Integer>();

    /** Maps a lower-case class name to the weight group used by {@link #GetClassWeight}. */
    static final Map<String, Integer> classMap = new HashMap<String, Integer>();

    static {
        tagMap.put("div", 1);
        tagMap.put("pre", 2);
        tagMap.put("td", 3);
        tagMap.put("blockquote", 4);
        tagMap.put("address", 5);
        tagMap.put("ol", 6);
        tagMap.put("ul", 7);
        tagMap.put("dl", 8);
        tagMap.put("dd", 9);
        tagMap.put("dt", 10);
        tagMap.put("li", 11);
        tagMap.put("form", 12);
        tagMap.put("h1", 13);
        tagMap.put("h2", 14);
        tagMap.put("h3", 15);
        tagMap.put("h4", 16);
        tagMap.put("h5", 17);
        tagMap.put("h6", 18);
        tagMap.put("th", 19);

        // These are class names, so they belong in classMap. They used to be
        // written into tagMap, which left classMap empty (class weights were
        // always 0) and gave the <article>, <main> and <body> tags an
        // unintended tag score.
        classMap.put("content", 1);
        classMap.put("article", 2);
        classMap.put("main", 3);
        classMap.put("body", 4);
        classMap.put("introduction", 5);
        classMap.put("shadow", 6);
        classMap.put("and", 7);
        classMap.put("column", 8);
    }

    /** Class/id fragments that mark a node as unlikely to hold article content. */
    private static final Pattern s_unlikelyCandidates = Pattern.compile("combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|rating_box|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter|copyright", Pattern.CASE_INSENSITIVE);

    /** Class/id fragments that rescue a node matched by {@link #s_unlikelyCandidates}. */
    private static final Pattern s_okMaybeItsACandidate = Pattern.compile("and|article|body|column|main|shadow|content|introduction", Pattern.CASE_INSENSITIVE);

    /** Opening tags whose presence means a {@code div} has block-level children. */
    private static final Pattern s_divToPElements = Pattern.compile("<(a|blockquote|dl|div|img|ol|p|pre|table|ul)", Pattern.CASE_INSENSITIVE);

    /** ASCII comma or fullwidth comma; compiled once instead of on every scored node. */
    private static final Pattern s_commas = Pattern.compile("[,]|[\uFF0C]");

    /**
     * Inner HTML of the selected article container. When no container
     * qualifies (or the body itself scores best) this is the body's inner HTML.
     */
    public String Content;

    /**
     * Parses {@code documentHtml} and extracts its article content.
     *
     * @param documentHtml raw HTML of the page
     * @return an instance whose {@link #Content} holds the extracted markup
     */
    public static Readability Create(String documentHtml) {
        return new Readability(documentHtml);
    }

    private Readability(String documentHtml) {
        Document doc = Jsoup.parse(documentHtml);
        RemoveScripts(doc);
        this.Content = GetArticleContent(doc);
    }

    /** Removes every {@code <script>} element found at or below {@code node}. */
    private static void RemoveScripts(Element node) {
        for (Element script : node.getElementsByTag("script")) {
            script.remove();
        }
    }

    /**
     * Returns the node's inner HTML.
     *
     * <p>Despite the name this is markup, not plain text: the embedded-video
     * check in the scoring pass looks for the substring "embed" and host
     * names in this value, so it depends on tags and attributes being present.
     */
    private static String GetInnerText(Element node) {
        return node.html();
    }

    /**
     * Returns the share of the node's inner HTML that sits inside {@code <a>}
     * elements, or 0 for a node with no content (previously 0/0 gave NaN).
     */
    private static double GetLinkDensity(Element node) {
        int textLength = GetInnerText(node).length();
        if (textLength == 0) {
            return 0.0;
        }
        int linkLength = 0;
        for (Element link : node.getElementsByTag("a")) {
            linkLength += GetInnerText(link).length();
        }
        return (double) linkLength / textLength;
    }

    /** Returns the {@link #tagMap} group for {@code key}, or 0 when unmapped. */
    private static int switchTagStr(String key) {
        Integer group = tagMap.get(key);
        return group == null ? 0 : group.intValue();
    }

    /** Returns the {@link #classMap} group for {@code key}, or 0 when unmapped. */
    private static int switchClassStr(String key) {
        Integer group = classMap.get(key);
        return group == null ? 0 : group.intValue();
    }

    /**
     * Computes a node's starting score from its tag name plus its class weight.
     *
     * @return +5 for {@code div}; +3 for {@code pre}/{@code td}/{@code blockquote};
     *         -3 for list, address and form tags; -5 for headings and {@code th};
     *         each combined with {@link #GetClassWeight}
     */
    private static int CalculateNodeScore(Element node) {
        int score = 0;
        switch (switchTagStr(node.nodeName())) {
            case 1:
                score += 5;
                break;
            case 2:
            case 3:
            case 4:
                score += 3;
                break;
            case 5:
            case 6:
            case 7:
            case 8:
            case 9:
            case 10:
            case 11:
            case 12:
                score -= 3;
                break;
            case 13:
            case 14:
            case 15:
            case 16:
            case 17:
            case 18:
            case 19:
                score -= 5;
                break;
            default:
                break;
        }
        return score + GetClassWeight(node);
    }

    /**
     * Returns the bonus earned by the node's {@code class} attribute.
     *
     * <p>The attribute is split on whitespace and the best-scoring token wins,
     * so {@code class="post content"} is recognised. A single-token attribute
     * yields the same value as an exact-match lookup.
     */
    private static int GetClassWeight(Element node) {
        if (!node.hasAttr("class")) {
            return 0;
        }
        // Locale.ROOT keeps the lower-casing independent of the device locale.
        String classes = node.attr("class").toLowerCase(Locale.ROOT).trim();
        int best = 0;
        for (String token : classes.split("\\s+")) {
            best = Math.max(best, weightForClassToken(token));
        }
        return best;
    }

    /** Maps one class token to its weight: 10, 5, 3, or 0 when unknown. */
    private static int weightForClassToken(String token) {
        switch (switchClassStr(token)) {
            case 1:
            case 2:
            case 3:
            case 4:
                return 10;
            case 5:
                return 5;
            case 6:
            case 7:
            case 8:
                return 3;
            default:
                return 0;
        }
    }

    /**
     * Selects the most article-like container in {@code doc} and returns its
     * inner HTML.
     *
     * @return inner HTML of the best candidate, or of the whole body when no
     *         candidate exists or the body itself scores highest
     */
    private static String GetArticleContent(Document doc) {
        Element body = doc.body();
        List<Element> nodesToScore = collectNodesToScore(body);

        // Keyed by identity so two structurally equal elements never share a
        // score, regardless of how the jsoup version in use implements
        // Element.equals/hashCode.
        Map<Element, Integer> scores = new IdentityHashMap<Element, Integer>();
        List<Element> candidates = new ArrayList<Element>();
        scoreCandidates(nodesToScore, scores, candidates);

        Element topCandidate = selectTopCandidate(candidates, scores);

        // The fallback must run after all candidates were compared. It used to
        // sit inside the selection loop, so it never ran for an empty candidate
        // list (yielding null) and, when the body was leading mid-loop, it
        // replaced the body's score with that of a fresh wrapper div.
        if (topCandidate == null || topCandidate.nodeName().equals("body")) {
            return body.html();
        }
        return topCandidate.html();
    }

    /**
     * Prunes unlikely nodes, normalises {@code div}s and returns the
     * paragraph-like nodes that should contribute to scoring.
     */
    private static List<Element> collectNodesToScore(Element body) {
        List<Element> nodesToScore = new ArrayList<Element>();

        // getAllElements() hands back a collected list, so removing or
        // replacing nodes below does not disturb this iteration.
        for (Element node : body.getAllElements()) {
            Document owner = node.ownerDocument();
            if (owner == null) {
                // Descendant of a node that was already removed or replaced;
                // scoring it would let pruned content (comments, sidebars)
                // win the ranking.
                continue;
            }

            String name = node.nodeName();

            // Match against class AND id. The former ternary bound as
            // class : ("" + id), ignoring the id whenever a class was present.
            String unlikelyMatchString = node.attr("class") + " " + node.attr("id");
            boolean structural = name.equals("body") || name.equals("html") || name.equals("head");
            if (!structural
                    && s_unlikelyCandidates.matcher(unlikelyMatchString).find()
                    && !s_okMaybeItsACandidate.matcher(unlikelyMatchString).find()) {
                node.remove();
                continue;
            }

            if (name.equals("p") || name.equals("td") || name.equals("pre")) {
                nodesToScore.add(node);
            }

            if (name.equals("div")) {
                if (!s_divToPElements.matcher(node.html()).find()) {
                    // A div without block-level children behaves like a paragraph.
                    Element paragraph = owner.createElement("p");
                    paragraph.html(node.html());
                    node.replaceWith(paragraph);
                    nodesToScore.add(paragraph);
                } else {
                    wrapTextChildren(node, owner);
                }
            }
        }
        return nodesToScore;
    }

    /** Wraps each direct text child of {@code div} in its own {@code <p>}. */
    private static void wrapTextChildren(Element div, Document owner) {
        // Iterate over a copy because children are replaced inside the loop.
        for (Node child : new ArrayList<Node>(div.childNodes())) {
            if (child instanceof TextNode) {
                Element paragraph = owner.createElement("p");
                // text(), not html(): the value is already-decoded text, and
                // re-parsing it would turn escaped source such as
                // "&lt;script&gt;" into live markup.
                paragraph.text(((TextNode) child).text());
                child.replaceWith(paragraph);
            }
        }
    }

    /**
     * Awards each scorable node's points to its parent (in full) and its
     * grandparent (half), registering both as candidates on first sight.
     */
    private static void scoreCandidates(List<Element> nodesToScore,
                                        Map<Element, Integer> scores,
                                        List<Element> candidates) {
        for (Element node : nodesToScore) {
            Element parentNode = node.parent();
            if (parentNode == null) {
                continue;
            }
            String parentName = parentNode.nodeName();
            if (parentName.equals("body") || parentName.equals("html") || parentName.equals("footer")) {
                continue;
            }
            // attr() yields "" for a missing attribute, so no hasAttr guard is needed.
            if (parentNode.attr("class").equals("copyright")) {
                continue;
            }

            String innerText = GetInnerText(node);
            if (innerText.length() < 25) {
                continue;
            }

            Element grandParentNode = parentNode.parent();
            registerCandidate(parentNode, scores, candidates);
            if (grandParentNode != null) {
                registerCandidate(grandParentNode, scores, candidates);
            }

            int contentScore = 1;
            if (looksLikeEmbeddedVideo(innerText)) {
                contentScore += 50;
            }
            contentScore += s_commas.split(innerText).length;
            contentScore += Math.min(innerText.length() / 100, 3);

            scores.put(parentNode, scores.get(parentNode) + contentScore);
            if (grandParentNode != null) {
                scores.put(grandParentNode, scores.get(grandParentNode) + contentScore / 2);
            }
        }
    }

    /** Adds {@code node} to the candidate list with its base score, once. */
    private static void registerCandidate(Element node,
                                          Map<Element, Integer> scores,
                                          List<Element> candidates) {
        if (!scores.containsKey(node)) {
            scores.put(node, CalculateNodeScore(node));
            candidates.add(node);
        }
    }

    /**
     * Reports whether the markup mentions "embed" together with one of the
     * known video-host substrings.
     */
    private static boolean looksLikeEmbeddedVideo(String innerText) {
        if (!innerText.contains("embed")) {
            return false;
        }
        return innerText.contains("youku")
                || innerText.contains("tudou")
                || innerText.contains("ku6")
                || innerText.contains("sohu")
                || innerText.contains("weiphone")
                || innerText.contains("56")
                || innerText.contains("youtube")
                || innerText.contains("qq");
    }

    /**
     * Scales every candidate's score by {@code 1 - linkDensity}, stores the
     * result back into {@code scores} and returns the highest-scoring
     * candidate, or {@code null} when there are none. Ties keep the earlier
     * candidate.
     */
    private static Element selectTopCandidate(List<Element> candidates, Map<Element, Integer> scores) {
        Element topCandidate = null;
        int topScore = 0;
        for (Element candidate : candidates) {
            int adjusted = (int) (scores.get(candidate) * (1 - GetLinkDensity(candidate)));
            scores.put(candidate, adjusted);
            if (topCandidate == null || adjusted > topScore) {
                topCandidate = candidate;
                topScore = adjusted;
            }
        }
        return topCandidate;
    }
}