package net.fred.feedex.utils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.regex.Pattern;
/**
* This class is thread safe.
*
* @author Alex P (ifesdjeen from jreadability)
* @author Peter Karich
*/
public class ArticleTextExtractor {
    // Tags that may carry the main article text; only these are weighted.
    private static final Pattern NODES = Pattern.compile("p|div|td|h1|h2|article|section");
    // Unlikely candidates (comments, navigation, social widgets, ads, ...).
    // FIX: an alternation separator "|" was missing between "sponsor" and
    // "a(d|...)", fusing them into "sponsora(d|...)" so that neither "sponsor"
    // nor "ad"/"all"/"agegate"/"archive"/"attachment" could ever match alone.
    private static final Pattern UNLIKELY = Pattern.compile("com(bx|ment|munity)|dis(qus|cuss)|e(xtra|[-]?mail)|foot|"
            + "header|menu|re(mark|ply)|rss|sh(are|outbox)|social|twitter|facebook|sponsor|"
            + "a(d|ll|gegate|rchive|ttachment)|(pag(er|ination))|popup|print|"
            + "login|si(debar|gn|ngle)|hinweis|expla(in|nation)?|metablock");
    // Most likely positive candidates
    private static final Pattern POSITIVE = Pattern.compile("(^(body|content|h?entry|main|page|post|text|blog|story|haupt))"
            + "|arti(cle|kel)|instapaper_body");
    // Very most likely positive candidates, used by Joomla CMS
    private static final Pattern ITSJOOMLA = Pattern.compile("articleBody");
    // Most likely negative candidates
    private static final Pattern NEGATIVE = Pattern.compile("nav($|igation)|user|com(ment|bx)|(^com-)|contact|"
            + "foot|masthead|(me(dia|ta))|outbrain|promo|related|scroll|(sho(utbox|pping))|"
            + "sidebar|sponsor|tags|tool|widget|player|disclaimer|toc|infobox|vcard");
    private static final Pattern NEGATIVE_STYLE =
            Pattern.compile("hidden|display: ?none|font-size: ?small");
    // Heading tags h1..h6. FIX: replaces the old "h1;h2;...".contains(tagName)
    // check, whose direction was inverted — it tested whether the tag name is a
    // substring of the list, so bogus one-character tags like "h" or "1" matched.
    private static final Pattern HEADERS = Pattern.compile("h[1-6]");

    /**
     * Parses the given HTML stream and extracts the most likely article content.
     * Wasn't tested with improper HTML, although jsoup should be able to handle
     * minor stuff.
     *
     * @param input HTML document to parse; the charset is auto-detected by
     *              jsoup. The stream is NOT closed by this method.
     * @param contentIndicator a text which should be included into the extracted
     *                         content, or null
     * @return extracted article as an HTML fragment, or null if nothing suitable
     *         was found
     * @throws Exception if reading or parsing the stream fails
     */
    public static String extractContent(InputStream input, String contentIndicator) throws Exception {
        return extractContent(Jsoup.parse(input, null, ""), contentIndicator);
    }

    /**
     * Extracts the most likely article element from an already-parsed document.
     * The document is modified in place (clutter such as scripts, styles and
     * form selects is removed).
     *
     * @param doc parsed document, never null
     * @param contentIndicator a text which should be included into the extracted
     *                         content, or null
     * @return the best-scoring element as an HTML string (prefixed with the
     *         og:image if one is declared and not already contained), or null
     */
    public static String extractContent(Document doc, String contentIndicator) {
        if (doc == null)
            throw new NullPointerException("missing document");
        // now remove the clutter
        prepareDocument(doc);
        // weight all candidate elements and keep the heaviest one
        Collection<Element> nodes = getNodes(doc);
        int maxWeight = 0;
        Element bestMatchElement = null;
        for (Element entry : nodes) {
            int currentWeight = getWeight(entry, contentIndicator);
            if (currentWeight > maxWeight) {
                maxWeight = currentWeight;
                bestMatchElement = entry;
                if (maxWeight > 300) {
                    break; // clearly good enough, stop scanning early
                }
            }
        }
        // look for an Open Graph preview image declared in <head>
        String ogImage = null;
        for (Element entry : getMetas(doc)) {
            if (entry.hasAttr("property") && "og:image".equals(entry.attr("property"))) {
                ogImage = entry.attr("content");
                break;
            }
        }
        if (bestMatchElement != null) {
            String ret = bestMatchElement.toString();
            // prepend the og:image only if the article doesn't already show it
            if (ogImage != null && !ret.contains(ogImage)) {
                ret = "<img src=\"" + ogImage + "\"><br>\n" + ret;
            }
            return ret;
        }
        return null;
    }

    /**
     * Weights current element. By matching it with positive candidates and
     * weighting child nodes. Since it's impossible to predict which exactly
     * names, ids or class names will be used in HTML, major role is played by
     * child nodes.
     *
     * @param e Element to weight, along with child nodes
     * @param contentIndicator a text which should be included into the extracted
     *                         content, or null
     * @return accumulated score; higher means more likely the article body
     */
    private static int getWeight(Element e, String contentIndicator) {
        int weight = calcWeight(e);
        // +10 points per 100 characters of the element's own text
        weight += (int) Math.round(e.ownText().length() / 100.0 * 10);
        weight += weightChildNodes(e, contentIndicator);
        return weight;
    }

    /**
     * Weights the child nodes of the given element. During tests some
     * difficulties were met. For instance, not every single document has nested
     * paragraph tags inside of the major article tag. Sometimes people add one
     * more nesting level. So we give points for text found in direct children,
     * favoring elements with less nesting and thereby increasing the
     * probability of a correct extraction.
     *
     * @param rootEl Element whose child nodes will be weighted
     * @param contentIndicator a text which should be included into the extracted
     *                         content, or null
     * @return accumulated child score
     */
    private static int weightChildNodes(Element rootEl, String contentIndicator) {
        int weight = 0;
        Element caption = null;
        List<Element> pEls = new ArrayList<>(5);
        for (Element child : rootEl.children()) {
            String text = child.text();
            int textLength = text.length();
            if (textLength < 20) {
                continue; // too short to be meaningful content
            }
            if (contentIndicator != null && text.contains(contentIndicator)) {
                weight += 100; // We certainly found the item
            }
            String ownText = child.ownText();
            int ownTextLength = ownText.length();
            if (ownTextLength > 200) {
                weight += Math.max(50, ownTextLength / 10);
            }
            String tagName = child.tagName();
            if (tagName.equals("h1") || tagName.equals("h2")) {
                weight += 30;
            } else if (tagName.equals("div") || tagName.equals("p")) {
                weight += calcWeightForChild(ownText);
                if (tagName.equals("p") && textLength > 50)
                    pEls.add(child);
                // equalsIgnoreCase avoids the locale-sensitive toLowerCase()
                // pitfall (e.g. Turkish dotless i) while behaving the same
                if (child.className().equalsIgnoreCase("caption"))
                    caption = child;
            }
        }
        // use caption and image
        if (caption != null)
            weight += 30;
        // several real paragraphs plus headings strongly suggest an article
        if (pEls.size() >= 2) {
            for (Element subEl : rootEl.children()) {
                if (HEADERS.matcher(subEl.tagName()).matches()) {
                    weight += 20;
                }
            }
        }
        return weight;
    }

    /** @return 1 point per 25 characters of a child's own text. */
    private static int calcWeightForChild(String text) {
        return text.length() / 25;
    }

    /**
     * Scores an element by matching its class name, id and attributes against
     * the positive/negative candidate patterns.
     *
     * @param e element to score
     * @return base score before text-length and child weighting
     */
    private static int calcWeight(Element e) {
        int weight = 0;
        if (POSITIVE.matcher(e.className()).find())
            weight += 35;
        if (POSITIVE.matcher(e.id()).find())
            weight += 40;
        if (ITSJOOMLA.matcher(e.attributes().toString()).find())
            weight += 200;
        if (UNLIKELY.matcher(e.className()).find())
            weight -= 20;
        if (UNLIKELY.matcher(e.id()).find())
            weight -= 20;
        if (NEGATIVE.matcher(e.className()).find())
            weight -= 50;
        if (NEGATIVE.matcher(e.id()).find())
            weight -= 50;
        // jsoup's attr() returns "" (never null) when the attribute is absent
        String style = e.attr("style");
        if (!style.isEmpty() && NEGATIVE_STYLE.matcher(style).find())
            weight -= 50;
        return weight;
    }

    /**
     * Prepares document. Currently only stripping clutter (scripts, styles,
     * form selects), since unlikely candidates from time to time get more
     * score than good ones, especially when the major text is short.
     *
     * @param doc document to prepare; changed in place
     */
    private static void prepareDocument(Document doc) {
        removeSelectsAndOptions(doc);
        removeScriptsAndStyles(doc);
    }

    /**
     * Removes all script, noscript and style elements from the document.
     *
     * @param doc document to clean; changed in place
     * @return the same document, for chaining
     */
    private static Document removeScriptsAndStyles(Document doc) {
        for (Element item : doc.getElementsByTag("script")) {
            item.remove();
        }
        for (Element item : doc.getElementsByTag("noscript")) {
            item.remove();
        }
        for (Element item : doc.getElementsByTag("style")) {
            item.remove();
        }
        return doc;
    }

    /**
     * Removes all select and option form elements from the document.
     *
     * @param doc document to clean; changed in place
     * @return the same document, for chaining
     */
    private static Document removeSelectsAndOptions(Document doc) {
        for (Element item : doc.getElementsByTag("select")) {
            item.remove();
        }
        for (Element item : doc.getElementsByTag("option")) {
            item.remove();
        }
        return doc;
    }

    /**
     * @return a set of all meta nodes found in the document head
     */
    private static Collection<Element> getMetas(Document doc) {
        Collection<Element> nodes = new HashSet<>(64);
        nodes.addAll(doc.select("head").select("meta"));
        return nodes;
    }

    /**
     * @return a set of all important nodes (body elements whose tag matches
     *         {@link #NODES}); iteration order is unspecified
     */
    private static Collection<Element> getNodes(Document doc) {
        Collection<Element> nodes = new HashSet<>(64);
        for (Element el : doc.select("body").select("*")) {
            if (NODES.matcher(el.tagName()).matches()) {
                nodes.add(el);
            }
        }
        return nodes;
    }
}