package de.jetwick.snacktory;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.Date;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * This class is thread safe.
 *
 * @author Alex P (ifesdjeen from jreadability)
 * @author Peter Karich
 */
public class ArticleTextExtractor {

    private static final Logger logger = LoggerFactory.getLogger(ArticleTextExtractor.class);
    // Interesting nodes
    private static final Pattern NODES = Pattern.compile("p|div|td|h1|h2|article|section");
    // Unlikely candidates
    private String unlikelyStr;
    private Pattern UNLIKELY;
    // Most likely positive candidates
    private String positiveStr;
    private Pattern POSITIVE;
    // Most likely negative candidates
    private String negativeStr;
    private Pattern NEGATIVE;
    private static final Pattern NEGATIVE_STYLE =
            Pattern.compile("hidden|display: ?none|font-size: ?small");
    private static final Pattern IGNORE_AUTHOR_PARTS =
            Pattern.compile("by|name|author|posted|twitter|handle|news", Pattern.CASE_INSENSITIVE);
    private static final Set<String> IGNORED_TITLE_PARTS = new LinkedHashSet<String>() {
        {
            add("hacker news");
            add("facebook");
        }
    };
    private static final OutputFormatter DEFAULT_FORMATTER = new OutputFormatter();
    private OutputFormatter formatter = DEFAULT_FORMATTER;
    private static final int MIN_AUTHOR_NAME_LENGTH = 4;

    public ArticleTextExtractor() {
        setUnlikely("com(bx|ment|munity)|dis(qus|cuss)|e(xtra|[-]?mail)|foot|"
                + "header|menu|re(mark|ply)|rss|sh(are|outbox)|sponsor|"
                + "a(d|ll|gegate|rchive|ttachment)|(pag(er|ination))|popup|print|"
                + "login|si(debar|gn|ngle)");
        setPositive("(^(body|content|h?entry|main|page|post|text|blog|story|haupt))"
                + "|arti(cle|kel)|instapaper_body");
        setNegative("nav($|igation)|user|com(ment|bx)|(^com-)|contact|"
                + "foot|masthead|(me(dia|ta))|outbrain|promo|related|scroll|(sho(utbox|pping))|"
                + "sidebar|sponsor|tags|tool|widget|player|disclaimer|toc|infobox|vcard");
    }

    public ArticleTextExtractor setUnlikely(String unlikelyStr) {
        this.unlikelyStr = unlikelyStr;
        UNLIKELY = Pattern.compile(unlikelyStr);
        return this;
    }

    public ArticleTextExtractor addUnlikely(String unlikelyMatches) {
        return setUnlikely(unlikelyStr + "|" + unlikelyMatches);
    }

    public ArticleTextExtractor setPositive(String positiveStr) {
        this.positiveStr = positiveStr;
        POSITIVE = Pattern.compile(positiveStr);
        return this;
    }

    public ArticleTextExtractor addPositive(String pos) {
        return setPositive(positiveStr + "|" + pos);
    }

    public ArticleTextExtractor setNegative(String negativeStr) {
        this.negativeStr = negativeStr;
        NEGATIVE = Pattern.compile(negativeStr);
        return this;
    }

    public ArticleTextExtractor addNegative(String neg) {
        setNegative(negativeStr + "|" + neg);
        return this;
    }

    public void setOutputFormatter(OutputFormatter formatter) {
        this.formatter = formatter;
    }

    // Returns the best node match based on the weights
    private Element getBestMatchElement(Collection<Element> nodes) {
        int maxWeight = -200;
        Element bestMatchElement = null;
        for (Element entry : nodes) {
            int currentWeight = getWeight(entry, false);
            if (currentWeight > maxWeight) {
                maxWeight = currentWeight;
                bestMatchElement = entry;
                // a weight above 200 is already a clear winner, stop searching
                if (maxWeight > 200)
                    break;
            }
        }
        return bestMatchElement;
    }
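    // Typical usage (sketch, assuming JResult exposes getters matching the
    // setters used below, e.g. getText() and getImageUrl()):
    //
    //   ArticleTextExtractor extractor = new ArticleTextExtractor();
    //   JResult res = extractor.extractContent(rawHtml); // rawHtml = page source
    //   String articleText = res.getText();              // plain text, tags stripped
    //   String leadImage = res.getImageUrl();            // best-guess article image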
    /**
     * Extracts the article text from the given document or HTML string. Wasn't
     * tested with improper HTML, although jsoup should be able to handle minor
     * problems.
     *
     * @return the extracted article, all HTML tags stripped
     */
    public JResult extractContent(Document doc) throws Exception {
        return extractContent(new JResult(), doc, formatter);
    }

    public JResult extractContent(Document doc, OutputFormatter formatter) throws Exception {
        return extractContent(new JResult(), doc, formatter);
    }

    public JResult extractContent(String html) throws Exception {
        return extractContent(new JResult(), html);
    }

    public JResult extractContent(JResult res, String html) throws Exception {
        return extractContent(res, html, formatter);
    }

    public JResult extractContent(JResult res, String html, OutputFormatter formatter) throws Exception {
        if (html.isEmpty())
            throw new IllegalArgumentException("html string is empty!?");

        // http://jsoup.org/cookbook/extracting-data/selector-syntax
        return extractContent(res, Jsoup.parse(html), formatter);
    }

    public JResult extractContent(JResult res, Document doc, OutputFormatter formatter) throws Exception {
        if (doc == null)
            throw new NullPointerException("missing document");

        res.setTitle(extractTitle(doc));
        res.setDescription(extractDescription(doc));
        res.setCanonicalUrl(extractCanonicalUrl(doc));
        res.setAuthorName(extractAuthorName(doc));
        res.setAuthorDescription(extractAuthorDescription(doc, res.getAuthorName()));

        // get the date from the document; if not present, extract it from the URL if possible
        Date docdate = extractDate(doc);
        if (docdate == null) {
            String dateStr = SHelper.estimateDate(res.getUrl());
            docdate = parseDate(dateStr);
        }
        res.setDate(docdate);

        // now remove the clutter
        prepareDocument(doc);

        // init elements
        Collection<Element> nodes = getNodes(doc);
        Element bestMatchElement = getBestMatchElement(nodes);
        if (bestMatchElement != null) {
            List<ImageResult> images = new ArrayList<ImageResult>();
            Element imgEl = determineImageSource(bestMatchElement, images);
            if (imgEl != null) {
                res.setImageUrl(SHelper.replaceSpaces(imgEl.attr("src")));
                // TODO remove parent container of image if it is contained in bestMatchElement
                // to avoid image subtitles flooding in
                res.setImages(images);
            }

            // clean before grabbing the text
            String text = formatter.getFormattedText(bestMatchElement);
            text = removeTitleFromText(text, res.getTitle());
            // this fails for short facebook posts and probably tweets:
            // text.length() > res.getDescription().length()
            if (text.length() > res.getTitle().length()) {
                res.setText(text);
                String fullHtml = bestMatchElement.toString();
                res.setHtml(fullHtml);
                // collect all links (a with href) together with their position in the HTML
                Elements children = bestMatchElement.select("a[href]");
                int linkpos = 0;
                int lastlinkpos = 0;
                for (Element child : children) {
                    String linkstr = child.toString();
                    linkpos = fullHtml.indexOf(linkstr, lastlinkpos);
                    res.addLink(child.attr("abs:href"), child.text(), linkpos);
                    lastlinkpos = linkpos;
                }
            }
            res.setTextList(formatter.getTextList(bestMatchElement));
        }

        if (res.getImageUrl().isEmpty()) {
            res.setImageUrl(extractImageUrl(doc));
        }

        res.setRssUrl(extractRssUrl(doc));
        res.setVideoUrl(extractVideoUrl(doc));
        res.setFaviconUrl(extractFaviconUrl(doc));
        res.setKeywords(extractKeywords(doc));
        return res;
    }
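    // Title lookup below falls back through several sources; e.g. for a head like
    //   <title>Example Headline | Site</title>
    //   <meta property="og:title" content="Example Headline">
    // the <title> text wins, and the og:/twitter: meta tags are only consulted
    // when the earlier sources are empty.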
    protected String extractTitle(Document doc) {
        String title = cleanTitle(doc.title());
        if (title.isEmpty()) {
            title = SHelper.innerTrim(doc.select("head title").text());
            if (title.isEmpty()) {
                title = SHelper.innerTrim(doc.select("head meta[name=title]").attr("content"));
                if (title.isEmpty()) {
                    title = SHelper.innerTrim(doc.select("head meta[property=og:title]").attr("content"));
                    if (title.isEmpty()) {
                        title = SHelper.innerTrim(doc.select("head meta[name=twitter:title]").attr("content"));
                    }
                }
            }
        }
        return title;
    }

    protected String extractCanonicalUrl(Document doc) {
        String url = SHelper.replaceSpaces(doc.select("head link[rel=canonical]").attr("href"));
        if (url.isEmpty()) {
            url = SHelper.replaceSpaces(doc.select("head meta[property=og:url]").attr("content"));
            if (url.isEmpty()) {
                url = SHelper.replaceSpaces(doc.select("head meta[name=twitter:url]").attr("content"));
            }
        }
        return url;
    }

    protected String extractDescription(Document doc) {
        String description = SHelper.innerTrim(doc.select("head meta[name=description]").attr("content"));
        if (description.isEmpty()) {
            description = SHelper.innerTrim(doc.select("head meta[property=og:description]").attr("content"));
            if (description.isEmpty()) {
                description = SHelper.innerTrim(doc.select("head meta[name=twitter:description]").attr("content"));
            }
        }
        return description;
    }

    // Returns the publication date or null
    protected Date extractDate(Document doc) {
        String dateStr = "";

        // try some locations that nytimes uses
        Element elem = doc.select("meta[name=ptime]").first();
        if (elem != null) {
            dateStr = SHelper.innerTrim(elem.attr("content"));
            // elem.attr("extragravityscore", Integer.toString(100));
            // System.out.println("date modified element " + elem.toString());
        }

        if (dateStr.isEmpty()) {
            dateStr = SHelper.innerTrim(doc.select("meta[name=utime]").attr("content"));
        }
        if (dateStr.isEmpty()) {
            dateStr = SHelper.innerTrim(doc.select("meta[name=pdate]").attr("content"));
        }
        if (dateStr.isEmpty()) {
            dateStr = SHelper.innerTrim(doc.select("meta[property=article:published]").attr("content"));
        }
        if (!dateStr.isEmpty()) {
            return parseDate(dateStr);
        }

        // taking this stuff directly from Juicer (and converted to Java)
        // opengraph (?)
        Elements elems = doc.select("meta[property=article:published_time]");
        if (elems.size() > 0) {
            Element el = elems.get(0);
            if (el.hasAttr("content")) {
                dateStr = el.attr("content");
                if (dateStr.endsWith("Z")) {
                    dateStr = dateStr.substring(0, dateStr.length() - 1) + "GMT-00:00";
                } else {
                    // insert "GMT" before the timezone offset, e.g.
                    // "2014-04-29T10:00:00-05:00" -> "2014-04-29T10:00:00GMT-05:00"
                    dateStr = String.format("%sGMT%s",
                            dateStr.substring(0, dateStr.length() - 6),
                            dateStr.substring(dateStr.length() - 6));
                }
                return parseDate(dateStr);
            }
        }

        // rnews
        elems = doc.select("meta[property=dateCreated], span[property=dateCreated]");
        if (elems.size() > 0) {
            Element el = elems.get(0);
            if (el.hasAttr("content")) {
                dateStr = el.attr("content");
                return parseDate(dateStr);
            } else {
                return parseDate(el.text());
            }
        }

        // schema.org creativework
        elems = doc.select("meta[itemprop=datePublished], span[itemprop=datePublished]");
        if (elems.size() > 0) {
            Element el = elems.get(0);
            if (el.hasAttr("content")) {
                dateStr = el.attr("content");
                return parseDate(dateStr);
            } else if (el.hasAttr("value")) {
                dateStr = el.attr("value");
                return parseDate(dateStr);
            } else {
                return parseDate(el.text());
            }
        }

        // parsely page (?)
        /* skip conversion for now, seems highly specific and uses a new lib
        elems = doc.select("meta[name=parsely-page]");
        if (elems.size() > 0) {
            implicit val formats = net.liftweb.json.DefaultFormats
            Element el = elems.get(0);
            if (el.hasAttr("content")) {
                val json = parse(el.attr("content"))
                return DateUtils.parseDateStrictly((json \ "pub_date").extract[String],
                        Array("yyyy-MM-dd'T'HH:mm:ssZ", "yyyy-MM-dd'T'HH:mm:ss'Z'",
                              "yyyy-MM-dd'T'HH:mm:ssZZ", "yyyy-MM-dd'T'HH:mm:ssz"))
            }
        }
        */

        // BBC
        elems = doc.select("meta[name=OriginalPublicationDate]");
        if (elems.size() > 0) {
            Element el = elems.get(0);
            if (el.hasAttr("content")) {
                dateStr = el.attr("content");
                return parseDate(dateStr);
            }
        }

        // wired
        elems = doc.select("meta[name=DisplayDate]");
        if (elems.size() > 0) {
            Element el = elems.get(0);
            if (el.hasAttr("content")) {
                dateStr = el.attr("content");
                return parseDate(dateStr);
            }
        }

        // wildcard
        elems = doc.select("meta[name*=date]");
        if (elems.size() > 0) {
            Element el = elems.get(0);
            if (el.hasAttr("content")) {
                dateStr = el.attr("content");
                return parseDate(dateStr);
            }
        }

        return null;
    }
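    // parseDate hands the patterns below to Util.parseDateStrictly, which
    // (assuming Apache-Commons-style semantics) tries them in order until one
    // matches; e.g. "2014-04-29T10:00:00GMT-05:00" matches
    // "yyyy-MM-dd'T'HH:mm:ssz" and "29 Apr 2014" matches "dd MMM yyyy".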
    private Date parseDate(String dateStr) {
        String[] parsePatterns = {
            "yyyy-MM-dd'T'HH:mm:ssz",
            "yyyy-MM-dd HH:mm:ss",
            "yyyy/MM/dd HH:mm:ss",
            "yyyy-MM-dd HH:mm",
            "yyyy/MM/dd HH:mm",
            "yyyy-MM-dd",
            "yyyy/MM/dd",
            "MM/dd/yyyy HH:mm:ss",
            "MM-dd-yyyy HH:mm:ss",
            "MM/dd/yyyy HH:mm",
            "MM-dd-yyyy HH:mm",
            "MM/dd/yyyy",
            "MM-dd-yyyy",
            "EEE, MMM dd, yyyy",
            "MM/dd/yyyy hh:mm:ss a",
            "MM-dd-yyyy hh:mm:ss a",
            "MM/dd/yyyy hh:mm a",
            "MM-dd-yyyy hh:mm a",
            "yyyy-MM-dd hh:mm:ss a",
            "yyyy/MM/dd hh:mm:ss a",
            "yyyy-MM-dd hh:mm a",
            "yyyy/MM/dd hh:mm",
            "dd MMM yyyy",
            "dd MMMM yyyy",
            "yyyyMMddHHmm",
            "yyyyMMdd HHmm",
            "dd-MM-yyyy HH:mm:ss",
            "dd/MM/yyyy HH:mm:ss",
            "dd MMM yyyy HH:mm:ss",
            "dd MMMM yyyy HH:mm:ss",
            "dd-MM-yyyy HH:mm",
            "dd/MM/yyyy HH:mm",
            "dd MMM yyyy HH:mm",
            "dd MMMM yyyy HH:mm",
            "yyyyMMddHHmmss",
            "yyyyMMdd HHmmss",
            "yyyyMMdd"
        };
        try {
            return Util.parseDateStrictly(dateStr, parsePatterns);
        } catch (Exception ex) {
            return null;
        }
    }

    private String cleanAuthorNameResult(String result) {
        result = SHelper.innerTrim(result);
        try {
            // If the result is a valid URL, it's not an author name, so ignore it.
            // Was an issue with http://www.engadget.com/2014/04/29/freedompop-iphone/
            new URL(result);
            return "";
        } catch (MalformedURLException e) {
            return result;
        }
    }
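    // Author lookup order below, e.g. for markup like
    //   <a rel="author" href="/staff/jane">Jane Doe</a>
    //   <meta name="author" content="Jane Doe">
    // the rel=author element wins; the meta tags and byline class heuristics
    // are fallbacks.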
    // Returns the author name or an empty string
    protected String extractAuthorName(Document doc) {
        String authorName = "";

        // first try the Google author tag
        Element result = doc.select("body [rel*=author]").first();
        if (result != null)
            authorName = cleanAuthorNameResult(result.ownText());

        // if that doesn't work, try some other methods
        if (authorName.isEmpty()) {

            // meta tag approaches, get content
            result = doc.select("head meta[name=author]").first();
            if (result != null) {
                authorName = cleanAuthorNameResult(result.attr("content"));
            }

            if (authorName.isEmpty()) { // for "opengraph"
                authorName = cleanAuthorNameResult(doc.select("head meta[property=article:author]").attr("content"));
            }
            if (authorName.isEmpty()) { // for "schema.org creativework"
                authorName = cleanAuthorNameResult(doc.select("meta[itemprop=author], span[itemprop=author]").attr("content"));
            }

            // other hacks
            if (authorName.isEmpty()) {
                try {
                    // build up a set of elements which have likely author-related terms
                    // .X searches for class X
                    Elements matches = doc.select(".byLineTag,.byline,.author,.by,.writer,.address");

                    if (matches == null || matches.size() == 0) {
                        matches = doc.select("body [class*=author]");
                    }

                    if (matches == null || matches.size() == 0) {
                        matches = doc.select("body [title*=author]");
                    }

                    // a hack for huffington post
                    if (matches == null || matches.size() == 0) {
                        matches = doc.select(".staff_info dl a[href]");
                    }

                    // select the best element from them
                    if (matches != null) {
                        Element bestMatch = getBestMatchElement(matches);
                        if (bestMatch != null) {
                            authorName = bestMatch.ownText();
                            if (authorName.length() < MIN_AUTHOR_NAME_LENGTH) {
                                authorName = bestMatch.text();
                            }
                            authorName = cleanAuthorNameResult(IGNORE_AUTHOR_PARTS.matcher(authorName).replaceAll(""));

                            // Remove twitter handle: http://www.engadget.com/2014/04/29/freedompop-iphone/
                            int atIndex = authorName.indexOf("@");
                            if (atIndex > 0) {
                                authorName = cleanAuthorNameResult(authorName.substring(0, atIndex));
                            }
                            if (authorName.indexOf(",") != -1) {
                                authorName = authorName.split(",")[0];
                            }
                        }
                    }
                } catch (Exception e) {
                    logger.warn("extracting author name failed: " + e.toString());
                }
            }
        }

        return authorName;
    }

    // Returns the author description or an empty string
    protected String extractAuthorDescription(Document doc, String authorName) {
        String authorDesc = "";
        if (authorName.isEmpty())
            return "";

        // look for the element that mentions the author's name directly
        Elements nodes = doc.select(":containsOwn(" + authorName + ")");
        Element bestMatch = getBestMatchElement(nodes);
        if (bestMatch != null)
            authorDesc = bestMatch.text();
        return authorDesc;
    }

    protected Collection<String> extractKeywords(Document doc) {
        String content = SHelper.innerTrim(doc.select("head meta[name=keywords]").attr("content"));
        if (content != null) {
            if (content.startsWith("[") && content.endsWith("]"))
                content = content.substring(1, content.length() - 1);

            String[] split = content.split("\\s*,\\s*");
            if (split.length > 1 || (split.length > 0 && !"".equals(split[0])))
                return Arrays.asList(split);
        }
        return Collections.emptyList();
    }
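    // Example for extractKeywords above:
    //   <meta name="keywords" content="[java, parsing , html]">
    // yields the list ["java", "parsing", "html"]: the surrounding brackets are
    // stripped and the split swallows whitespace around each comma.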
    /**
     * Tries to extract an image url from metadata if determineImageSource
     * failed.
     *
     * @return image url or empty string
     */
    protected String extractImageUrl(Document doc) {
        // use the open graph tag to get the image
        String imageUrl = SHelper.replaceSpaces(doc.select("head meta[property=og:image]").attr("content"));
        if (imageUrl.isEmpty()) {
            imageUrl = SHelper.replaceSpaces(doc.select("head meta[name=twitter:image]").attr("content"));
            if (imageUrl.isEmpty()) {
                // prefer link over thumbnail-meta if empty
                imageUrl = SHelper.replaceSpaces(doc.select("link[rel=image_src]").attr("href"));
                if (imageUrl.isEmpty()) {
                    imageUrl = SHelper.replaceSpaces(doc.select("head meta[name=thumbnail]").attr("content"));
                }
            }
        }
        return imageUrl;
    }

    protected String extractRssUrl(Document doc) {
        return SHelper.replaceSpaces(doc.select("link[rel=alternate]").select("link[type=application/rss+xml]").attr("href"));
    }

    protected String extractVideoUrl(Document doc) {
        return SHelper.replaceSpaces(doc.select("head meta[property=og:video]").attr("content"));
    }

    protected String extractFaviconUrl(Document doc) {
        String faviconUrl = SHelper.replaceSpaces(doc.select("head link[rel=icon]").attr("href"));
        if (faviconUrl.isEmpty()) {
            faviconUrl = SHelper.replaceSpaces(doc.select("head link[rel^=shortcut],link[rel$=icon]").attr("href"));
        }
        return faviconUrl;
    }

    /**
     * Weights the current element by matching it against the positive
     * candidates and weighting its child nodes. Since it's impossible to
     * predict exactly which names, ids or class names will be used in the
     * HTML, a major role is played by the child nodes.
     *
     * @param e Element to weight, along with child nodes
     */
    protected int getWeight(Element e, boolean checkextra) {
        int weight = calcWeight(e);
        // roughly 10 points per 100 characters of the element's own text
        weight += (int) Math.round(e.ownText().length() / 100.0 * 10);
        weight += weightChildNodes(e);

        // add additional weight using the possible 'extragravityscore' attribute
        if (checkextra) {
            Element xelem = e.select("[extragravityscore]").first();
            if (xelem != null) {
                // System.out.println("HERE found one: " + xelem.toString());
                weight += Integer.parseInt(xelem.attr("extragravityscore"));
                // System.out.println("WITH WEIGHT: " + xelem.attr("extragravityscore"));
            }
        }
        return weight;
    }
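    // Worked example for getWeight above: a <div class="post"> with 300
    // characters of own text gets +35 from calcWeight (the class matches
    // POSITIVE) plus round(300 / 100.0 * 10) = +30 for its text, before any
    // child-node contributions from weightChildNodes below.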
    /**
     * Weights the child nodes of the given Element. During tests some
     * difficulties were met. For instance, not every document has nested
     * paragraph tags inside of the major article tag. Sometimes people add one
     * more nesting level. So we add points for text contained in a tag nested
     * inside the current weighted element, but fewer points for every element
     * that's nested one level deeper. This way we give more chances to the
     * element that has fewer nested levels, increasing the probability of a
     * correct extraction.
     *
     * @param rootEl Element whose child nodes will be weighted
     */
    protected int weightChildNodes(Element rootEl) {
        int weight = 0;
        Element caption = null;
        List<Element> pEls = new ArrayList<Element>(5);
        for (Element child : rootEl.children()) {
            String ownText = child.ownText();
            int ownTextLength = ownText.length();
            if (ownTextLength < 20)
                continue;

            if (ownTextLength > 200)
                weight += Math.max(50, ownTextLength / 10);

            if (child.tagName().equals("h1") || child.tagName().equals("h2")) {
                weight += 30;
            } else if (child.tagName().equals("div") || child.tagName().equals("p")) {
                weight += calcWeightForChild(child, ownText);
                if (child.tagName().equals("p") && ownTextLength > 50)
                    pEls.add(child);

                if (child.className().toLowerCase().equals("caption"))
                    caption = child;
            }
        }

        // use caption and image
        if (caption != null)
            weight += 30;

        if (pEls.size() >= 2) {
            for (Element subEl : rootEl.children()) {
                if ("h1;h2;h3;h4;h5;h6".contains(subEl.tagName())) {
                    weight += 20;
                    // headerEls.add(subEl);
                } else if ("table;li;td;th".contains(subEl.tagName())) {
                    addScore(subEl, -30);
                }

                if ("p".contains(subEl.tagName()))
                    addScore(subEl, 30);
            }
        }
        return weight;
    }

    public void addScore(Element el, int score) {
        int old = getScore(el);
        setScore(el, score + old);
    }

    public int getScore(Element el) {
        int old = 0;
        try {
            old = Integer.parseInt(el.attr("gravityScore"));
        } catch (Exception ex) {
            // no score set yet
        }
        return old;
    }

    public void setScore(Element el, int score) {
        el.attr("gravityScore", Integer.toString(score));
    }

    private int calcWeightForChild(Element child, String ownText) {
        // count markup leftovers (escaped quotes and brackets, pixel sizes);
        // too many of them indicate code or layout junk rather than article text
        int c = SHelper.count(ownText, "&quot;");
        c += SHelper.count(ownText, "&lt;");
        c += SHelper.count(ownText, "&gt;");
        c += SHelper.count(ownText, "px");
        int val;
        if (c > 5)
            val = -30;
        else
            val = (int) Math.round(ownText.length() / 25.0);

        addScore(child, val);
        return val;
    }

    private int calcWeight(Element e) {
        int weight = 0;
        if (POSITIVE.matcher(e.className()).find())
            weight += 35;

        if (POSITIVE.matcher(e.id()).find())
            weight += 40;

        if (UNLIKELY.matcher(e.className()).find())
            weight -= 20;

        if (UNLIKELY.matcher(e.id()).find())
            weight -= 20;

        if (NEGATIVE.matcher(e.className()).find())
            weight -= 50;

        if (NEGATIVE.matcher(e.id()).find())
            weight -= 50;

        String style = e.attr("style");
        if (style != null && !style.isEmpty() && NEGATIVE_STYLE.matcher(style).find())
            weight -= 50;
        return weight;
    }

    public Element determineImageSource(Element el, List<ImageResult> images) {
        int maxWeight = 0;
        Element maxNode = null;
        Elements els = el.select("img");
        if (els.isEmpty())
            els = el.parent().select("img");

        double score = 1;
        for (Element e : els) {
            String sourceUrl = e.attr("src");
            if (sourceUrl.isEmpty() || isAdImage(sourceUrl))
                continue;

            int weight = 0;
            int height = 0;
            try {
                height = Integer.parseInt(e.attr("height"));
                if (height >= 50)
                    weight += 20;
                else
                    weight -= 20;
            } catch (Exception ex) {
            }

            int width = 0;
            try {
                width = Integer.parseInt(e.attr("width"));
                if (width >= 50)
                    weight += 20;
                else
                    weight -= 20;
            } catch (Exception ex) {
            }

            String alt = e.attr("alt");
            if (alt.length() > 35)
                weight += 20;

            String title = e.attr("title");
            if (title.length() > 35)
                weight += 20;

            String rel = null;
            boolean noFollow = false;
            if (e.parent() != null) {
                rel = e.parent().attr("rel");
                if (rel != null && rel.contains("nofollow")) {
                    noFollow = rel.contains("nofollow");
                    weight -= 40;
                }
            }

            weight = (int) (weight * score);
            if (weight > maxWeight) {
                maxWeight = weight;
                maxNode = e;
                score = score / 2;
            }

            ImageResult image = new ImageResult(sourceUrl, weight, title, height, width, alt, noFollow);
            images.add(image);
        }

        Collections.sort(images, new ImageComparator());
        return maxNode;
    }
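    // Note on determineImageSource above: `score` halves each time a new best
    // image is found, so a later image needs a proportionally higher raw
    // weight to displace an earlier winner; document order therefore matters.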
    /**
     * Prepares the document by removing scripts and styles. Stripping of
     * unlikely candidates is currently disabled, since from time to time they
     * get a higher score than good ones, especially in cases when the major
     * text is short.
     *
     * @param doc document to prepare. Passed as reference, and changed inside
     * of function
     */
    protected void prepareDocument(Document doc) {
        // stripUnlikelyCandidates(doc);
        removeScriptsAndStyles(doc);
    }

    /**
     * Removes unlikely candidates from the HTML. Currently takes the id and
     * class name and matches them against a list of patterns.
     *
     * @param doc document to strip unlikely candidates from
     */
    protected void stripUnlikelyCandidates(Document doc) {
        for (Element child : doc.select("body").select("*")) {
            String className = child.className().toLowerCase();
            String id = child.id().toLowerCase();
            if (NEGATIVE.matcher(className).find() || NEGATIVE.matcher(id).find()) {
                // print("REMOVE:", child);
                child.remove();
            }
        }
    }

    private Document removeScriptsAndStyles(Document doc) {
        Elements scripts = doc.getElementsByTag("script");
        for (Element item : scripts) {
            item.remove();
        }

        Elements noscripts = doc.getElementsByTag("noscript");
        for (Element item : noscripts) {
            item.remove();
        }

        Elements styles = doc.getElementsByTag("style");
        for (Element style : styles) {
            style.remove();
        }

        return doc;
    }

    private void print(Element child) {
        print("", child, "");
    }

    private void print(String add, Element child) {
        print(add, child, "");
    }

    private void print(String add1, Element child, String add2) {
        logger.info(add1 + " " + child.nodeName() + " id=" + child.id()
                + " class=" + child.className() + " text=" + child.text() + " " + add2);
    }

    private boolean isAdImage(String imageUrl) {
        // heuristic: a url containing "ad" at least twice is most likely an ad
        return SHelper.count(imageUrl, "ad") >= 2;
    }

    /**
     * Match only exact matches, as longestSubstring can be too fuzzy.
     */
    public String removeTitleFromText(String text, String title) {
        // don't do this, as it's terrible to read
        // int index1 = text.toLowerCase().indexOf(title.toLowerCase());
        // if (index1 >= 0)
        //     text = text.substring(index1 + title.length());
        // return text.trim();
        return text;
    }

    /**
     * Based on a delimiter in the title, take the longest piece or do some
     * custom logic based on the site.
     *
     * @param title
     * @param delimiter
     * @return
     */
    private String doTitleSplits(String title, String delimiter) {
        String largeText = "";
        int largestTextLen = 0;
        String[] titlePieces = title.split(delimiter);

        // take the largest split
        for (String p : titlePieces) {
            if (p.length() > largestTextLen) {
                largeText = p;
                largestTextLen = p.length();
            }
        }

        largeText = largeText.replace("&raquo;", " ");
        largeText = largeText.replace("»", " ");
        return largeText.trim();
    }

    /**
     * @return a set of all important nodes
     */
    public Collection<Element> getNodes(Document doc) {
        Map<Element, Object> nodes = new LinkedHashMap<Element, Object>(64);
        int score = 100;
        for (Element el : doc.select("body").select("*")) {
            if (NODES.matcher(el.tagName()).matches()) {
                nodes.put(el, null);
                setScore(el, score);
                score = score / 2;
            }
        }
        return nodes.keySet();
    }

    public String cleanTitle(String title) {
        StringBuilder res = new StringBuilder();
        // int index = title.lastIndexOf("|");
        // if (index > 0 && title.length() / 2 < index)
        //     title = title.substring(0, index + 1);
        int counter = 0;
        String[] strs = title.split("\\|");
        for (String part : strs) {
            if (IGNORED_TITLE_PARTS.contains(part.toLowerCase().trim()))
                continue;

            if (counter == strs.length - 1 && res.length() > part.length())
                continue;

            if (counter > 0)
                res.append("|");

            res.append(part);
            counter++;
        }

        return SHelper.innerTrim(res.toString());
    }
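    // Example for cleanTitle above: cleanTitle("Some Headline | Hacker News")
    // drops the ignored "hacker news" segment and returns "Some Headline"; a
    // trailing segment is also dropped when it is shorter than what has been
    // kept so far.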
    /**
     * Comparator for images by weight.
     *
     * @author Chris Alexander, chris@chris-alexander.co.uk
     */
    public class ImageComparator implements Comparator<ImageResult> {

        @Override
        public int compare(ImageResult o1, ImageResult o2) {
            // sorts the highest weight first
            return o2.weight.compareTo(o1.weight);
        }
    }
}