Textify.java example

Explorer
feedscribe-master
- src
  - net
    - oddsoftware
      - android
        feedscribe
        AudioPlayer.java
        Globals.java
        data
        Download.java
        Downloader.java
        Enclosure.java
        Feed.java
        FeedConfig.java
        FeedDBAdaptor.java
        FeedEnclosureInfo.java
        FeedItem.java
        FeedItemEnclosureInfo.java
        FeedManager.java
        FeedSettings.java
        FeedStatus.java
        FeedUpdateListener.java
        PlaylistProvider.java
        ShortFeedItem.java
        service
        FeedService.java
        ScheduleReceiver.java
        ui
        AddItemActivity.java
        DownloadsActivity.java
        DownloadsListAdapter.java
        FeedSettingsActivity.java
        FeedsActivity.java
        FileDialog.java
        LogListActivity.java
        MyViewFlipper.java
        NewsActivity.java
        NewsFeedsListAdapter.java
        NewsItemsListAdapter.java
        NewsPreferencesActivity.java
        PlayAudioActivity.java
        PlayVideoActivity.java
        PodcastFeedsListAdaptor.java
        PodcastItemsListAdapter.java
        PodcastsActivity.java
        SubscribeActivity.java
        html
        CacheItem.java
        HttpCache.java
        HttpCacheProvider.java
        Textify.java
        utils
        FlushedInputStream.java
        LogEntry.java
        Logger.java
        LoggerFactory.java
        MediaScan.java
        TagParser.java
        Utilities.java
        WakefulIntentService.java
        WorkerThread.java
  - org
    - htmlcleaner
/*
 *  Copyright 2012 Brendan McCarthy (brendan@oddsoftware.net)
 *
 *  This file is part of Feedscribe.
 *
 *  Feedscribe is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License version 3 
 *  as published by the Free Software Foundation.
 *
 *  Feedscribe is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with Feedscribe.  If not, see <http://www.gnu.org/licenses/>.
 */
package net.oddsoftware.android.html;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import net.oddsoftware.android.feedscribe.Globals;

import org.htmlcleaner.BrowserCompactXmlSerializer;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.CleanerTransformations;
import org.htmlcleaner.ContentNode;
import org.htmlcleaner.DoctypeToken;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.HtmlNode;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.TagNodeVisitor;
import org.htmlcleaner.TagTransformation;
import org.htmlcleaner.XPatherException;

import android.util.Log;

public class Textify
{
    // Article title; supplied through setTitle() or, when still null, derived from the document in process().
    protected String mTitle = null;
    
    // Author string supplied through setAuthor().
    private String mAuthor = null;

    // Publication date string supplied through setPubDate().
    private String mPubDate = null;
    
    // Content score accumulated per node by addScore(); recreated at the start of every process() call.
    protected HashMap<TagNode, Integer> mTagScores;
    // Tag name -> attribute names registered for that tag (populated in the constructor).
    protected HashMap<String, HashSet<String> > mAllowedAttributes;
    // Tag names registered as blacklisted (populated in the constructor).
    protected HashSet<String > mBlacklistTags;
    // Tag name -> replacement tag name (populated in the constructor).
    protected HashMap<String, String> mConvertTags;
    
    // id/class text that marks a node as unlikely to hold article content.
    protected Pattern mUnlikelyCandidates;
    // id/class text that rescues a node which also matched mUnlikelyCandidates.
    protected Pattern mOkMaybeItsACandidates;
    // Tag names whose presence below a div/li stops it being treated as a plain paragraph.
    protected Pattern mDivToPElements;
    // id/class text that raises a node's class weight.
    protected Pattern mPositiveClassNames;
    // id/class text that lowers a node's class weight.
    protected Pattern mNegativeClassNames;
    // Matches a full stop followed by a space or the end of the text.
    protected Pattern mSentencePattern;
    
    // HTML parser, configured in the constructor.
    protected HtmlCleaner mCleaner;
    
    // Result of the last process() call.
    protected TagNode mArticleRoot;
    
    // When false, process() skips extraction and uses the document body (or root) as the article.
    protected boolean mProcessingEnabled;

    // When true, getArticleRoot() removes nodes whose id/class look unlikely before scoring.
    private boolean mStripUnlikelyCandidates = true;

    // Score of the best candidate found by getArticleRoot(); exposed through getArticleScore().
    private int mBestScore;
    
    // NOTE(review): not referenced in the visible part of this file - presumably used by the output code; confirm.
    private boolean mForcePageWidth = false;
    
    // Viewport string supplied through setViewport().
    private String mViewport = null;
    
    /**
     * Builds an extractor with the default heuristics: the id/class patterns used
     * for scoring, the tag/attribute tables, and a configured {@link HtmlCleaner}.
     * Processing is enabled by default.
     */
    public Textify()
    {
        final int flags = Pattern.CASE_INSENSITIVE;
        
        mUnlikelyCandidates = Pattern.compile(
                "combx|comment|community|disqus|extra|foot|header|menu|remark|rss|" +
                "shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter",
                flags);
        
        // "header" is listed for espnf1; a boring header gets discarded by the other rules anyway
        mOkMaybeItsACandidates = Pattern.compile(
                "and|article|body|column|main|shadow|commentary-panel|header",
                flags);
        
        mDivToPElements = Pattern.compile(
                "a|blockquote|dl|div|img|ol|p|pre|table|ul",
                flags);
        
        mPositiveClassNames = Pattern.compile(
                "article|body|content|entry|hentry|main|page|pagination|post|text|blog|story|stry|" +
                "datetools",
                flags);
        
        mNegativeClassNames = Pattern.compile(
                "combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|" +
                "promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget|contentheading",
                flags);
        
        mSentencePattern = Pattern.compile("\\.( |$)");
        
        
        // Tags that are registered with no attributes at all; they all share one empty set.
        // (font would otherwise carry color, face, size)
        final String[] attributeFreeTags = {
                // inline and block text
                "p", "div", "br", "b", "big", "center", "code", "cite", "del", "dfn", "em", "font",
                "u", "i", "ins", "kbd", "pre", "s", "samp", "small", "strong", "sub", "sup", "span",
                "strike", "tt", "var", "abbr", "acronym", "address", "blockquote", "q", "wbr", "nobr",
                "xmp", "hr",
                // table parts
                "th", "td", "tr", "thead", "tbody", "tfoot", "col", "colgroup", "caption",
                // lists
                "li", "ul", "ol", "dd", "dl", "dt", "menu", "dir"
        };
        
        // Tags that keep a specific list of attributes: { tag, attribute... }
        final String[][] tagsWithAttributes = {
                { "a", "href" },
                { "img", "alt", "src", "title", "width", "height", "align", "usemap" },
                { "bdo", "dir" },
                { "map", "name" },
                { "area", "shape", "coords", "href", "alt" },
                { "table", "width", "cellspacing", "cellpadding", "border", "align" }
        };
        
        mAllowedAttributes = new HashMap<String, HashSet<String> >();
        
        HashSet<String> noAttributes = new HashSet<String>();
        for( String tag : attributeFreeTags )
        {
            mAllowedAttributes.put( tag, noAttributes );
        }
        
        for( String[] entry : tagsWithAttributes )
        {
            List<String> attributes = Arrays.asList( entry ).subList( 1, entry.length );
            mAllowedAttributes.put( entry[0], new HashSet<String>( attributes ) );
        }
        
        mConvertTags = new HashMap<String, String>();
        for( String tag : new String[] { "blink", "marquee" } )
        {
            mConvertTags.put( tag, "span" );
        }
        
        final String[] blacklistedTags = {
                // document plumbing and embedded content
                "meta", "link", "style", "bgsound", "base", "object", "applet", "param", "script",
                "noscript", "basefont", "comment", "server", "iframe", "embed",
                // form controls
                "form", "input", "option", "textarea", "select", "optgroup", "button", "label",
                "fieldset", "legend", "isindex"
        };
        
        mBlacklistTags = new HashSet<String>();
        for( String tag : blacklistedTags )
        {
            mBlacklistTags.add( tag );
        }
        
        
        mCleaner = new HtmlCleaner();
        
        CleanerProperties cleanerProperties = mCleaner.getProperties();
        cleanerProperties.setOmitComments(true);
        cleanerProperties.setOmitDoctypeDeclaration(false);
        cleanerProperties.setTransResCharsToNCR(true);
        cleanerProperties.setPruneTags("script");
        cleanerProperties.setUseEmptyElementTags(false);
        
        // noscript blocks are turned into plain divs while parsing
        CleanerTransformations noscriptToDiv = new CleanerTransformations();
        noscriptToDiv.addTransformation( new TagTransformation("noscript", "div") );
        mCleaner.setTransformations(noscriptToDiv);
        
        mProcessingEnabled = true;
    }
    
    /**
     * Turns article extraction on or off for subsequent process() calls.
     *
     * @param processingEnabled false to make process() use the document body as-is
     */
    public void setProcessingEnabled(boolean processingEnabled)
    {
        this.mProcessingEnabled = processingEnabled;
    }
    
    /**
     * Controls whether nodes with unlikely-looking id/class values are removed
     * before scoring.
     *
     * @param stripUnlikelyCandidates true to remove such nodes
     */
    public void setStripUnlikelyCandidates(boolean stripUnlikelyCandidates)
    {
        this.mStripUnlikelyCandidates = stripUnlikelyCandidates;
    }
    
    /**
     * Stores the viewport string for later use.
     *
     * @param viewport the viewport value, may be null
     */
    public void setViewport(String viewport)
    {
        this.mViewport = viewport;
    }
    
    /**
     * @return the score of the best article candidate recorded by the last extraction
     */
    public int getArticleScore()
    {
        return this.mBestScore;
    }
    
    /**
     * Parses an HTML document held in a string and extracts the article from it.
     * The result is stored in mArticleRoot; the title is derived from the document
     * only when none has been set yet.
     *
     * @param input the HTML source
     */
    public void process(String input)
    {
        processCleanedRoot( mCleaner.clean( input ) );
    }
    
    
    /**
     * Parses an HTML document read from a stream and extracts the article from it.
     * Behaves exactly like {@link #process(String)}.
     *
     * @param input stream delivering the HTML source
     * @throws IOException if the stream cannot be read
     */
    public void process(InputStream input) throws IOException
    {
        processCleanedRoot( mCleaner.clean( input ) );
    }
    
    /**
     * Shared implementation of both process() overloads: resets the per-document
     * state, picks a title if needed and selects the article root.
     *
     * @param root root of the parsed document
     */
    private void processCleanedRoot(TagNode root)
    {
        // reset per-document state so nothing leaks over from a previous document
        mBestScore = 0;
        mTagScores = new HashMap<TagNode, Integer>();
        
        // an explicitly set title always wins over the one found in the document
        if( mTitle == null )
        {
            mTitle = getArticleTitle( root );
        }
        
        if( mProcessingEnabled )
        {
            preprocess(root);
            
            TagNode articleRoot = getArticleRoot( root );
            
            if( articleRoot != null )
            {
                cleanArticle( articleRoot );
            }
            else
            {
                // no candidate and no body at all: fall back to the raw document,
                // the same way the non-processing path does, instead of crashing
                articleRoot = root;
            }
            
            mArticleRoot = articleRoot;
        }
        else
        {
            mArticleRoot = root.findElementByName("body", false);
            if(mArticleRoot == null)
            {
                mArticleRoot = root;
            }
        }
    }
    
    /**
     * Sets the article title. A non-null title stops process() from deriving
     * one from the document.
     *
     * @param title the title to use, or null to have process() work it out
     */
    public void setTitle(String title)
    {
        this.mTitle = title;
    }
    
    /**
     * Stores the article author.
     *
     * @param author the author string, may be null
     */
    public void setAuthor(String author)
    {
        this.mAuthor = author;
    }
    
    /**
     * Stores the article publication date.
     *
     * @param pubDate the publication date string, may be null
     */
    public void setPubDate(String pubDate)
    {
        this.mPubDate = pubDate;
    }
    
    /**
     * @return the title set through setTitle() or derived by process(); null if neither happened yet
     */
    public String getTitle()
    {
        return this.mTitle;
    }
    
    
    /*
    private ArrayList<TagNode> findNextPageLink()
    {
        return null;
    }
    
    private String findBaseUrl(String url)
    {
        return url;
    }
    */
    
    /**
     * Computes which fraction of a node's non-whitespace text sits inside
     * anchor elements.
     *
     * @param node the node to inspect
     * @return 0.0 when the node contains no links, 1.0 when it has links but no
     *         text, otherwise link characters divided by all characters
     */
    private double getLinkDensity(TagNode node)
    {
        TagNode[] anchors = node.getElementsByName("a", true);
        int totalChars = countChars(node.getText());
        
        int anchorChars = 0;
        for( TagNode anchor : anchors )
        {
            anchorChars += countChars(anchor.getText());
        }
        
        if( anchors.length == 0 )
        {
            return 0.0;
        }
        
        if( totalChars == 0 )
        {
            // links only, no text to divide by
            return 1.0;
        }
        
        return anchorChars / (double) totalChars;
    }
    
    /**
     * Counts the characters in a buffer that are not whitespace.
     *
     * @param buffer the text to examine
     * @return number of non-whitespace characters
     */
    private int countChars(StringBuffer buffer)
    {
        int visible = 0;
        int index = buffer.length();
        
        // direction does not matter for a count, so walk from the end
        while( index-- > 0 )
        {
            if( Character.isWhitespace( buffer.charAt(index) ) )
            {
                continue;
            }
            visible++;
        }
        
        return visible;
    }
    
    /**
     * Works out a display title for the document.
     * 
     * Starts from the text of the title element, then narrows it down:
     * an "arttitle" element wins outright; otherwise a title of the form
     * "title - site" or "title | site" is cut down to one of its parts; otherwise
     * a title of the form "site: title" is cut down to the text after the colon.
     * If the result is shorter than 15 or longer than 150 characters and the
     * document has exactly one h1, the h1 text is used instead. A result of
     * fewer than four words is discarded in favour of the unprocessed title text.
     * 
     * @param root root of the parsed document
     * @return the chosen title, possibly empty, never null
     */
    private String getArticleTitle( TagNode root )
    {
        String title = "";
        String originalTitle = "";
        
        // captures everything after the first colon
        Pattern p2 = Pattern.compile(":(.*)");
        
        // find the <title> tag and process it
        try
        {
            Object[] titles = root.evaluateXPath("/head/title");
            
            if( titles.length > 0 && titles[0] instanceof TagNode)
            {
                title = ((TagNode)titles[0]).getText().toString();
                originalTitle = title;
            }
        }
        catch(XPatherException exc)
        {
            if( Globals.LOGGING ) Log.e(Globals.LOG_TAG, "getArticleTitle:", exc);
        }
        
        // some documents carry the article title in a dedicated "arttitle" element
        TagNode artTitle = root.findElementByName("arttitle", true);
        
        // see if title is of the form "title - fred news" or "title | joe news"
        String[] parts = title.split("\\s+[|-]");
        // find the longest part
        String longest = null;
        for( int i = 0; i < parts.length; ++i)
        {
            if( longest == null || parts[i].length() > longest.length())
            {
                longest = parts[i];
            }
        }
        
        // override the above in the case of only 2 parts, use the first one
        if( parts.length == 2)
        {
            longest = parts[0];
        }
        
        // text to work on when the title did not split into several parts; split() drops
        // trailing empty strings, so a title made only of separators yields no part at all
        String unsplit = ( longest != null ) ? longest : title;
        
        Matcher matcher;
        
        if( artTitle != null )
        {
            title = artTitle.getText().toString();
        }
        else if( parts.length > 1 )
        {
            // the title really contained a separator; longest cannot be null here
            title = longest;
        }
        // see if title is of the form "foo news: title"
        // (this branch used to be unreachable because the separator branch above
        // was taken for every title, split or not)
        else if( (matcher = p2.matcher(unsplit)).find() )
        {
            title = matcher.group(1);
        }
        else
        {
            title = unsplit;
        }
        
        // check title length, then try and find the one and only h1
        if( title.length() > 150 || title.length() < 15 )
        {
            TagNode[] h1s = root.getElementsByName("h1", true);
            if( h1s.length == 1)
            {
                title = h1s[0].getText().toString();
            }
        }
        
        title = title.trim();
        
        // if it has less than 4 words, use the original, unprocessed title
        if( title.split(" ").length < 4)
        {
            title = originalTitle;
        }
        
        return title;
    }
    
    /**
     * Rates a node by its class and id attributes.
     * 
     * Each of the two attributes, when present, costs 25 points if it matches
     * the negative name pattern and earns 25 points if it matches the positive
     * one; both can apply to the same attribute.
     * 
     * @param node the node to rate
     * @return the summed weight, between -50 and 50
     */
    private int getClassWeight(TagNode node)
    {
        String[] attributeValues = {
                node.getAttributeByName("class"),
                node.getAttributeByName("id")
        };
        
        int total = 0;
        
        for( String value : attributeValues )
        {
            if( value == null )
            {
                continue;
            }
            
            boolean looksBad = mNegativeClassNames.matcher( value ).find();
            boolean looksGood = mPositiveClassNames.matcher( value ).find();
            
            total -= looksBad ? 25 : 0;
            total += looksGood ? 25 : 0;
        }
        
        return total;
    }
    
    /**
     * Adds points to a node's entry in mTagScores.
     * 
     * The first time a node is scored it additionally receives a base value
     * depending on its tag name plus its class weight; later calls simply
     * accumulate.
     * 
     * @param node  the node to credit
     * @param score the points to add
     */
    private void addScore(TagNode node, int score)
    {
        Integer previous = mTagScores.get(node);
        
        if( previous != null )
        {
            // already initialised: just accumulate
            mTagScores.put(node, score + previous);
            return;
        }
        
        String name = node.getName();
        
        int tagBonus;
        if( name.equals("div") )
        {
            tagBonus = 5;
        }
        else if( Arrays.asList("pre", "td", "blockquote").contains(name) )
        {
            tagBonus = 3;
        }
        else if( Arrays.asList("address", "ol", "ul", "dl", "dd", "dt", "li", "form").contains(name) )
        {
            tagBonus = -3;
        }
        else if( Arrays.asList("th", "h1", "h2", "h3", "h4", "h5", "h6").contains(name) )
        {
            tagBonus = -5;
        }
        else
        {
            tagBonus = 0;
        }
        
        mTagScores.put(node, score + tagBonus + getClassWeight(node));
    }
    
    
    /**
     * Picks the part of the document that most likely holds the article.
     * 
     * Works in three passes: (1) walk all elements, optionally dropping nodes whose
     * id/class look unlikely, and collect paragraph-like nodes; (2) score each
     * collected node by its text and credit the score to the node, its parent and
     * (halved) its grandparent; (3) take the best-scoring node and gather it, plus
     * any sibling that scores well enough or looks like a real paragraph, into a
     * fresh div. The gathered nodes are detached from the original tree, and
     * mBestScore is updated with the winning score.
     * 
     * @param root root of the parsed document
     * @return a new div holding the selected nodes, or null when nothing was
     *         scored and the document has no body element
     */
    private TagNode getArticleRoot( TagNode root )
    {
        TagNode[] allElements = root.getAllElements(true);
        ArrayList<TagNode> nodesToScore = new ArrayList<TagNode>();
        
        for( int i = 0; i < allElements.length; ++i)
        {
            TagNode currentNode = allElements[i];
            String tagName = currentNode.getName();
            
            if( mStripUnlikelyCandidates   )
            {
                // a missing attribute contributes the literal text "null" to this string
                String unlikelyMatchString = "" + currentNode.getAttributeByName("id") + currentNode.getAttributeByName("class");
                
                // see if we are still in the tree - an ancestor may have been removed already
                TagNode parent = currentNode;
                while( parent != null && parent != root )
                {
                    parent = parent.getParent();
                }
                if( parent != root )
                {
                    continue;
                }
                
                // chuck out obviously bad nodes
                if( 
                        mUnlikelyCandidates.matcher( unlikelyMatchString ).find() && 
                        !mOkMaybeItsACandidates.matcher( unlikelyMatchString ).find() &&
                        !tagName.equals("body")
                        )
                {
                    currentNode.removeFromTree();
                    continue;
                }
            }
            
            if(
                    tagName.equals("p") ||
                    tagName.equals("td") ||
                    tagName.equals("pre")
                    )
            {
                nodesToScore.add(currentNode);
            }
            
            // score the li directly if it has nothing interesting inside it
            if( tagName.equals("li") && !hasBlockLevelDescendant(currentNode) )
            {
                nodesToScore.add(currentNode);
            }
            
            // a div without block-level content is really a paragraph: rename and score it
            if( tagName.equals("div") && !hasBlockLevelDescendant(currentNode) )
            {
                currentNode.setName("p");
                nodesToScore.add(currentNode);
            }
        }
        
        
        // once we get to here, nodesToScore contains everything we want to process for content
        StringBuilder innerText = new StringBuilder();
        for( int i = 0; i < nodesToScore.size(); ++i)
        {
            TagNode node = nodesToScore.get(i);
            
            TagNode parentNode = node.getParent();
            
            if( parentNode == null)
            {
                continue;
            }
            TagNode grandParentNode = parentNode.getParent();
            
            // too short to say anything about the surrounding content
            if( node.getTextLength() < 25 )
            {
                continue;
            }
            
            innerText.setLength(0);
            node.getText(innerText);
            
            int contentScore = 0;
            
            /* Add a point for the paragraph itself as a base. */
            contentScore++;
            
            contentScore += getCharCount( innerText, ",");

            /* For every 100 characters in this paragraph, add another point. Up to 3 points. */
            contentScore += Math.min(innerText.length() / 100, 3);
            
            /* try link density - -3 if the whole thing is a link */
            int linkDensity = (int) (getLinkDensity(node) * -3);
            contentScore += linkDensity;
            
            /* Add the score to the node and its parent. The grandparent gets half. */
            addScore(node, contentScore);
            addScore(parentNode, contentScore);

            if(grandParentNode != null) addScore(grandParentNode, contentScore / 2 );
        }
        
        // lets have a look at the score; on a tie the node met first keeps the lead
        TagNode bestNode = null;
        int bestScore = 0;
        for( Map.Entry<TagNode, Integer> entry : mTagScores.entrySet() )
        {
            TagNode node = entry.getKey();
            int score = entry.getValue();
            
            if( Globals.LOGGING )
            {
                Log.d(Globals.LOG_TAG, "score " + node.getName() + " " + node.getAttributeByName("class") + " " + node.getAttributeByName("id") + " = " + score);
            }
            
            if( bestNode == null || score > bestScore)
            {
                bestScore = score;
                bestNode = node;
            }
        }
        
        mBestScore = bestScore;
        
        if( bestNode == null )
        {
            bestNode = root.findElementByName("body", true);
        }
        
        if( bestNode == null)
        {
            return null;
        }
        
        if( Globals.LOGGING) Log.d(Globals.LOG_TAG, "The best score is " + bestScore + " " + bestNode.getText());
        
        
        // now we are going to look at the siblings of the best node to see what the output should be
        int siblingScoreThreshold = (int) Math.max(10, bestScore * 0.2 );
        
        // The best node can be parentless (for instance the document root, which
        // receives scores as a grandparent); it then has no siblings but itself.
        TagNode bestParent = bestNode.getParent();
        List<?> siblingNodes;
        if( bestParent != null )
        {
            siblingNodes = bestParent.getChildren();
        }
        else
        {
            siblingNodes = Arrays.asList( bestNode );
        }
        
        String bestClass = bestNode.getAttributeByName("class");
        
        ArrayList<TagNode> outputNodes = new ArrayList<TagNode>(); 
        
        StringBuilder content = new StringBuilder();
        for(Object o: siblingNodes)
        {
            if ( ! ( o instanceof TagNode ) )
            {
                continue;
            }
            
            TagNode sibling = (TagNode) o;
            
            String siblingName = sibling.getName();
            String siblingClass = sibling.getAttributeByName("class");
            
            Integer siblingScore = mTagScores.get(sibling);
            
            boolean output = false;
            
            int bonusScore = 0;
            
            // Give a bonus if sibling nodes and top candidates have the exact same class
            if( siblingClass != null && siblingClass.equals(bestClass) )
            {
                bonusScore += bestScore / 5;
            }
            
            if( sibling == bestNode )
            {
                output = true;
            }
            
            
            if( siblingScore != null && (siblingScore + bonusScore) >= siblingScoreThreshold )
            {
                output = true;
            }
            else if( siblingName.equals("p"))
            {
                // unscored (or low-scoring) paragraphs still count when they read like prose
                double linkDensity = getLinkDensity(sibling);
                content.setLength(0);
                sibling.getText(content);
                
                if( content.length() > 80 && linkDensity < 0.25 )
                {
                    output = true;
                }
                else if ( content.length() < 80 && linkDensity == 0 && mSentencePattern.matcher( content ).find() )
                {
                    output = true;
                }
            }
            
            if( output )
            {
                if( !siblingName.equals("div") && !siblingName.equals("p") )
                {
                    // the node is not a div or p, something trickier: force it to a div
                    sibling.setName("div");
                }
                
                outputNodes.add( sibling );
            }
        }
        
        // move the selected nodes out of the document into a fresh container
        TagNode article = new TagNode("div");
        
        for( int i = 0; i < outputNodes.size(); ++i )
        {
            TagNode node = outputNodes.get(i);
            node.removeFromTree();
            article.addChild(node);
        }
        
        
        return article;
    }
    
    /**
     * Tells whether any element below the given node has a tag name matched by
     * mDivToPElements.
     * 
     * NOTE(review): find() is an unanchored search, so a tag name that merely
     * contains one of the alternatives (e.g. "span" contains "a" and "p") counts
     * as well - confirm whether whole-name matching was intended.
     * 
     * @param node the node whose descendants are checked
     * @return true if at least one descendant tag name matches
     */
    private boolean hasBlockLevelDescendant(TagNode node)
    {
        TagNode[] descendants = node.getAllElements(true);
        for( int j = 0; j < descendants.length; ++j)
        {
            if( mDivToPElements.matcher( descendants[j].getName() ).find() )
            {
                return true;
            }
        }
        return false;
    }
    
    /**
     * Strips junk from an extracted article in place: forms, objects, h1s,
     * iframes, a lone h2, junk headers, suspicious tables/lists/divs and
     * finally paragraphs that are completely empty.
     * 
     * @param article root of the extracted article
     */
    void cleanArticle(TagNode article)
    {
        // TODO - remove consecutive <br> <br> <br>

        /* Clean out junk from the article */
        cleanConditionally(article, "form");
        for( String tag : new String[] { "object", "h1" } )
        {
            clean(article, tag);
        }

        /*
         * A single h2 is most likely the article header rather than a subheader,
         * and we already have a header, so drop it.
         */
        boolean loneH2 = article.getElementListByName("h2", true).size() == 1;
        if( loneH2 )
        {
            clean(article, "h2");
        }
        clean(article, "iframe");

        cleanHeaders(article);

        /* These go last because the removals above can change their outcome */
        for( String tag : new String[] { "table", "ul", "div" } )
        {
            cleanConditionally(article, tag);
        }

        /* Remove extra paragraphs: no text and no media inside */
        for( TagNode paragraph : article.getElementsByName("p", true) )
        {
            int images  = paragraph.getElementListByName("img", true).size();
            int embeds  = paragraph.getElementListByName("embed", true).size();
            int objects = paragraph.getElementListByName("object", true).size();
            
            boolean hasMedia = images != 0 || embeds != 0 || objects != 0;
            if( hasMedia || paragraph.getTextLength() != 0 )
            {
                continue;
            }
            
            paragraph.removeFromTree();
        }

        // TODO - replace <br><p> with <p> ?
    }
    
    /**
     * Removes every element with the given tag name found anywhere below root.
     * 
     * @param root the subtree to clean
     * @param tag  tag name of the elements to remove
     */
    void clean(TagNode root, String tag)
    {
        // TODO - if tag is object or embed, check for youtubeage
        for( TagNode match : root.getElementsByName(tag, true) )
        {
            match.removeFromTree();
        }
    }
    
    /**
     * Removes h1 and h2 elements that look like junk: a negative class weight
     * or a link density above 0.33.
     * 
     * @param root the subtree to clean
     */
    void cleanHeaders(TagNode root)
    {
        final String[] headerTags = { "h1", "h2" };
        
        for( String headerTag : headerTags )
        {
            for( TagNode header : root.getElementsByName(headerTag, true) )
            {
                boolean junk = getClassWeight(header) < 0 || getLinkDensity(header) > 0.33;
                if( junk )
                {
                    header.removeFromTree();
                }
            }
        }
    }
    
    
    /**
     * Removes descendants of {@code root} named {@code tag} that look like
     * boilerplate rather than article content.
     *
     * An element is removed outright when its class weight plus its entry in
     * {@code mTagScores} (0 if it has none) is negative. Otherwise, if its text
     * holds fewer than 10 commas, a series of heuristics (image, list item,
     * input and embed counts, text length, link density) decides whether it goes.
     * Elements with 10 or more commas are always kept.
     *
     * @param root subtree to search (searched recursively)
     * @param tag  tag name of the candidate elements
     */
    void cleanConditionally(TagNode root, String tag)
    {
        // One buffer reused for every candidate's text.
        StringBuilder textBuffer = new StringBuilder();

        for (TagNode candidate : root.getElementsByName(tag, true))
        {
            int weight = getClassWeight(candidate);

            Integer storedScore = mTagScores.get(candidate);
            int contentScore = (storedScore == null) ? 0 : storedScore;

            textBuffer.setLength(0);
            candidate.getText( textBuffer );

            if (weight + contentScore < 0)
            {
                candidate.removeFromTree();
                continue;
            }

            // Plenty of commas: treat it as real prose and leave it alone.
            if (getCharCount(textBuffer, ",") >= 10)
            {
                continue;
            }

            /*
             * Not very many commas: remove the element if the number of
             * non-paragraph elements is more than paragraphs, or on other ominous signs.
             */
            int paragraphs = candidate.getElementListByName("p", true).size();
            int images     = candidate.getElementListByName("img", true).size();
            // offset by 100, so the list-item test below only fires for very long lists
            int listItems  = candidate.getElementListByName("li", true).size() - 100;
            int inputs     = candidate.getElementListByName("input", true).size();
            int embeds     = candidate.getElementListByName("embed", true).size();

            // TODO - look for youtubes and keep them

            double linkDensity = getLinkDensity( candidate );
            int textLength = textBuffer.length();

            boolean toRemove =
                   (images > paragraphs && images > 1) // TODO - added this img > 1 check, see how bad this makes things
                || (listItems > paragraphs && !tag.equals("ul") && !tag.equals("ol"))
                || (inputs > paragraphs / 3)
                || (textLength < 25 && (images == 0 || images > 2))
                || (weight < 25 && linkDensity > 0.2)
                || (weight >= 25 && linkDensity > 0.5)
                || (embeds == 1 && textLength < 75)
                || (embeds > 1);

            // TODO - dirty hack for bbc image galleries
            boolean forceKeep = "galMain".equals(candidate.getAttributeByName("class"));

            if (toRemove && !forceKeep)
            {
                candidate.removeFromTree();
            }
        }
    }
    
    /**
     * Counts how many times {@code c} occurs in {@code text}.
     *
     * Matches may overlap: the search resumes one character after the start
     * of the previous match, so "aa" is counted twice in "aaa".
     *
     * @param text the text to scan
     * @param c    the substring to look for
     * @return the number of occurrences; 0 when {@code text} is null or
     *         {@code c} is null or empty
     */
    private int getCharCount(StringBuilder text, String c)
    {
        // An empty needle matches at every position, including text.length(),
        // so indexOf would never return -1 and the loop below would never end.
        if (text == null || c == null || c.length() == 0)
        {
            return 0;
        }

        int count = 0;
        for (int index = text.indexOf(c); index != -1; index = text.indexOf(c, index + 1))
        {
            ++count;
        }

        return count;
    }
    

    /**
     * Wraps the extracted article in a fresh HTML document and serializes it.
     *
     * The document gets a head (title, optional viewport meta tags, an inline
     * stylesheet) and a body holding an h1 with the title, optional author and
     * publication-date lines, a rule, and the article subtree. The article
     * subtree is then sanitized: blacklisted tags are detached, tags listed in
     * {@code mConvertTags} are renamed, tags with no entry in
     * {@code mAllowedAttributes} become attribute-less divs, and every other
     * tag keeps only its allowed attributes.
     *
     * Note that this re-parents and mutates {@code mArticleRoot}.
     *
     * @return the serialized document, or an empty string if serialization
     *         throws an IOException
     */
    public String getProcessedArticle()
    {
        TagNode html = new TagNode("html");

        // ---- head ----
        TagNode head = new TagNode("head");
        html.addChild( head );

        TagNode titleNode = new TagNode("title");
        titleNode.addChild( new ContentNode(mTitle) );
        head.addChild( titleNode );

        if( mViewport != null )
        {
            TagNode customViewport = new TagNode("meta");
            customViewport.setAttribute("name", "viewport");
            customViewport.setAttribute("content", mViewport);
            head.addChild(customViewport);
        }

        String css = "" +
                "DIV { font-family: sans-serif }" +
                "P   { font-family: sans-serif }" +
                "H1  { text-align: center; font-family: serif }" +
                "IMG { max-width: 100% ; height: auto } ";
        TagNode styleNode = new TagNode("style");
        styleNode.setAttribute("type", "text/css");
        styleNode.addChild(new ContentNode(css));
        head.addChild(styleNode);

        if( mForcePageWidth )
        {
            TagNode deviceWidthViewport = new TagNode("meta");
            deviceWidthViewport.setAttribute("name", "viewport");
            deviceWidthViewport.setAttribute("content", "width=device-width");
            head.addChild(deviceWidthViewport);
        }

        // The article is nested inside our own body, so it must not itself be a body.
        if( mArticleRoot.getName().equals("body"))
        {
            mArticleRoot.setName("div");
        }

        // ---- body ----
        TagNode body = new TagNode("body");
        html.addChild( body );

        TagNode content = new TagNode("div"); // TODO - set class

        TagNode headline = new TagNode("h1");
        headline.addChild( new ContentNode( mTitle) );
        content.addChild(headline);

        if( mAuthor != null && mAuthor.length() > 0 )
        {
            TagNode authorLine = new TagNode("div");
            authorLine.addChild( new ContentNode( mAuthor ) );
            content.addChild(authorLine);
        }

        if( mPubDate != null && mPubDate.length() > 0 )
        {
            TagNode dateLine = new TagNode("div");
            dateLine.addChild( new ContentNode( mPubDate ) );
            content.addChild(dateLine);
        }

        content.addChild(new TagNode("hr"));
        content.addChild( mArticleRoot );
        body.addChild(content);

        html.setDocType(new DoctypeToken("html", "PUBLIC", "-//W3C//DTD HTML 4.01//EN", "http://www.w3.org/TR/html4/strict.dtd"));

        // ---- sanitize the article subtree ----
        TagNodeVisitor sanitizer = new TagNodeVisitor() {

            public boolean visit(TagNode parentNode, org.htmlcleaner.HtmlNode htmlNode)
            {
                // returning true tells the visitor to continue traversing the DOM tree
                if (!(htmlNode instanceof TagNode))
                {
                    return true;
                }

                TagNode element = (TagNode) htmlNode;
                String name = element.getName();

                if( mBlacklistTags.contains(name) )
                {
                    element.removeFromTree();
                }
                else if (mConvertTags.containsKey(name))
                {
                    name = mConvertTags.get(name);
                    element.setName(name);
                }

                Set<String> allowed = mAllowedAttributes.get( name );

                if( allowed != null )
                {
                    // Iterate over a snapshot of the names so entries can be removed from the live map.
                    Map<String,String> attributes = element.getAttributes();
                    for( String attributeName : attributes.keySet().toArray(new String[0]) )
                    {
                        if( ! allowed.contains( attributeName ) )
                        {
                            attributes.remove(attributeName);
                        }
                    }
                    return true;
                }

                // convert unknown tags to div
                element.setName("div");
                element.getAttributes().clear();
                return true;
            }
        };
        mArticleRoot.traverse(sanitizer);

        try
        {
            BrowserCompactXmlSerializer serializer = new BrowserCompactXmlSerializer( mCleaner.getProperties() );
            return serializer.getAsString( html );
        }
        catch( IOException exc )
        {
            return "";
        }
    }
    
    /**
     * Turns runs of content separated by repeated &lt;br&gt; tags into &lt;p&gt; elements,
     * for better parsing.
     *
     * For every element under the document's body, the direct children are scanned
     * for the pattern &lt;br&gt;(whitespace)&lt;br&gt;(content). When the pattern is found, the
     * first &lt;br&gt; of the run is renamed to &lt;p&gt; and the content that followed the
     * run is moved underneath it. Does nothing if {@code root} has no body element.
     *
     * TODO - see if there are any frames and maybe use those instead, readability looks at how big the frames are on the screen
     *
     * @param root root of the parsed document; the tree is modified in place
     */
    void preprocess(TagNode root)
    {
        TagNode body = root.findElementByName("body", true);
        
        // Scratch lists shared by every visit() call; they are cleared at the start
        // and end of each visited element.
        final ArrayList<Object> contentToTransform = new ArrayList<Object>();
        final ArrayList<TagNode> brsToRemove = new ArrayList<TagNode>();
        
        if( body != null )
        {
            // traverse the whole body and, element by element, convert <br><br>-separated content into paragraphs
            body.traverse(new TagNodeVisitor()
            {
                public boolean visit(TagNode tagNode, HtmlNode htmlNode)
                {
                    if (htmlNode instanceof TagNode)
                    {
                        TagNode tag = (TagNode) htmlNode;
                        
                        // grab all child nodes and see if there are any duplicate <br><br> tags
                        // (the array is a snapshot, so the children can be moved around while scanning)
                        Object[] children = tag.getChildren().toArray();
                        contentToTransform.clear();
                        brsToRemove.clear();
                        int brCount = 0;                  // number of <br> tags seen in the current run
                        boolean hasDoneTransform = false; // true once a paragraph has been created in the current run
                        
                        // so we are scanning for <br>(whitespace)<br>(content)
                        // the loop runs one step past the last child; on that extra step child is null,
                        // which acts as an end-of-children marker
                        for( int i = 0; i < children.length + 1; ++i )
                        {
                            Object child = null;
                            
                            if( i < children.length )
                            {
                                child = children[i];
                            }
                            
                            boolean isBr = false;
                            boolean doTransform = false;
                            
                            if( child instanceof TagNode )
                            {
                                TagNode childTag = (TagNode) child;
                                if( childTag.getName().equals("br") )
                                {
                                    isBr = true;
                                    
                                    brsToRemove.add(childTag);
                                    
                                    // we have 2 brs previously and then some content, now this br, time to transform
                                    if( brCount > 1 && contentToTransform.size() > 0 )
                                    {
                                        doTransform = true;
                                    }
                                    brCount += 1;
                                }
                            }
                            
                            // if we have found a br previously, see if the next node is junk, if so we keep scanning
                            // otherwise we reset
                            if( brCount == 1 && !isBr && child != null)
                            {
                                boolean reset = true;
                                if( child instanceof ContentNode )
                                {
                                    ContentNode childContent = (ContentNode) child;
                                    String content = childContent.getContent().toString().trim();
                                    // whitespace-only text between two brs does not break the run
                                    if( content.length() == 0)
                                    {
                                        reset = false;
                                    }
                                }
                                
                                if( reset )
                                {
                                    // System.out.println("resetting on " + child + ":" + child.getClass());
                                    brCount = 0;
                                    hasDoneTransform = false;
                                    contentToTransform.clear();
                                    brsToRemove.clear();
                                }
                            }
                            
                            // any non-br child seen while a run is active is a candidate for the next paragraph
                            if( brCount > 0 && !isBr && child != null)
                            {
                                // System.out.println("Appending tranformation candidate " + contentToTransform.size());
                                contentToTransform.add(child);
                            }
                            
                            // end of children: flush the trailing content, but only if this run
                            // has already produced at least one paragraph
                            if( brCount > 1 && child == null && hasDoneTransform)
                            {
                                doTransform = true;
                            }
                            
                            if( doTransform )
                            {
                                // System.out.println("PerformingTransform");
                                hasDoneTransform = true;
                                
                                TagNode newParagraph = null;
                                
                                // reuse the first br of the run as the paragraph element
                                if( brsToRemove.size() > 0 )
                                {
                                    newParagraph = brsToRemove.get(0);
                                }
                                else // this is the last paragraph, make a new one
                                {
                                    // NOTE(review): brsToRemove appears to be non-empty whenever doTransform
                                    // is set, so this branch may be unreachable - confirm
                                    newParagraph = new TagNode("p");
                                    tag.addChild(newParagraph);
                                    hasDoneTransform = false;
                                }
                                
                                newParagraph.setName("p"); // turn silly old <br> into a shiny <p> to stick the new content under
                                
                                // move the collected content from the parent into the paragraph
                                for(int j = 0; j < contentToTransform.size(); ++j)
                                {
                                    Object transformChild = contentToTransform.get(j);
                                    
                                    if( transformChild instanceof TagNode )
                                    {
                                        ((TagNode) transformChild).removeFromTree();
                                    }
                                    
                                    tag.removeChild(transformChild);
                                    
                                    newParagraph.addChild( transformChild );
                                }
                                
                                // drop the brs in the middle of the run: the first is now the paragraph,
                                // the last is kept to start the next run
                                for( int j = 1; j < brsToRemove.size() - 1; ++j )
                                {
                                    brsToRemove.get(j).removeFromTree();
                                }
                                
                                // System.out.println("new content is " + mCleaner.getInnerHtml(newParagraph));
                                
                                contentToTransform.clear();
                                TagNode savedBr = brsToRemove.get( brsToRemove.size() - 1);
                                brsToRemove.clear();
                                brsToRemove.add(savedBr);
                                brCount = 2; // initialise state as if we have previously seen some brs so consecutive paragraphs get transformed
                            }
                        }
                        
                        // System.out.println("finished processing children");
                        
                        contentToTransform.clear();
                        brsToRemove.clear();
                    }
                    // tells visitor to continue traversing the DOM tree
                    return true;
                }
            });
        }
    }
}